This data publication contains a spatial database of wildfires that occurred in the United States from 1992 to 2018. It is the fourth update of a publication originally generated to support the national Fire Program Analysis (FPA) system. The wildfire records were acquired from the reporting systems of federal, state, and local fire organizations. The following core data elements were required for records to be included in this data publication: discovery date, final fire size, and a point location at least as precise as a Public Land Survey System (PLSS) section (1-square mile grid). The data were transformed to conform, when possible, to the data standards of the National Wildfire Coordinating Group (NWCG), including an updated wildfire-cause standard (approved August 2020). Basic error-checking was performed and redundant records were identified and removed, to the degree possible. In addition to incorporating data for 2016-2018, some previously unavailable nonfederal wildfire records for the period 1999-2015 were acquired either directly from the state fire services (NH, NJ) or indirectly from an updated National Association of State Foresters database (AR, AZ, CA, CO, FL, HI, ID, IL, OK, SD) and added. The resulting product, referred to as the Fire Program Analysis fire-occurrence database (FPA FOD), includes 2.17 million geo-referenced wildfire records, representing a total of 165 million acres burned during the 27-year period.
library(RSQLite) #for connecting to SQLite database
library(dbplyr) #for connecting to database
library(dplyr) #for data manipulation
library(tidyr) #for tidying data
library(ggthemes) #for visual themes
library(lubridate) #for date conversion
library(chron) #for time conversion
library(magrittr) #call and update with %<>%
library(pastecs) #descriptive stats
library(ggplot2) #for visuals
library(mosaicData) #for correlation matrix
library(ggcorrplot) #for linear regression
library(scales) #for normalizing
library(ggpubr) #for ggarrange
library(viridis) #for color scale
library(hrbrthemes) #themes for ggplot2
import_roboto_condensed() #for font
# create db connection
conn <- dbConnect(SQLite(), "FPA_FOD_20210617.sqlite")
# pull the fires table into RAM
fires <- tbl(conn, "Fires") %>%
collect()
# check size
print(object.size(fires), units = "Gb")
## 0.8 Gb
# disconnect from db
dbDisconnect(conn)
glimpse(fires)
## Rows: 2,166,753
## Columns: 37
## $ FOD_ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1…
## $ FPA_ID <chr> "FS-1418826", "FS-1418827", "FS-1418835"…
## $ SOURCE_SYSTEM_TYPE <chr> "FED", "FED", "FED", "FED", "FED", "FED"…
## $ SOURCE_SYSTEM <chr> "FS-FIRESTAT", "FS-FIRESTAT", "FS-FIREST…
## $ NWCG_REPORTING_AGENCY <chr> "FS", "FS", "FS", "FS", "FS", "FS", "FS"…
## $ NWCG_REPORTING_UNIT_ID <chr> "USCAPNF", "USCAENF", "USCAENF", "USCAEN…
## $ NWCG_REPORTING_UNIT_NAME <chr> "Plumas National Forest", "Eldorado Nati…
## $ SOURCE_REPORTING_UNIT <int> 511, 503, 503, 503, 503, 503, 503, 514, …
## $ SOURCE_REPORTING_UNIT_NAME <chr> "Plumas National Forest", "Eldorado Nati…
## $ LOCAL_FIRE_REPORT_ID <int> 1, 13, 27, 43, 44, 54, 58, 3, 5, 61, 64,…
## $ LOCAL_INCIDENT_ID <chr> "PNF-47", "13", "021", "6", "7", "8", "9…
## $ FIRE_CODE <chr> "BJ8K", "AAC0", "A32W", NA, NA, NA, NA, …
## $ FIRE_NAME <chr> "FOUNTAIN", "PIGEON", "SLACK", "DEER", "…
## $ ICS_209_PLUS_INCIDENT_JOIN_ID <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ ICS_209_PLUS_COMPLEX_JOIN_ID <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ MTBS_ID <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ MTBS_FIRE_NAME <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ COMPLEX_NAME <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ FIRE_YEAR <int> 2005, 2004, 2004, 2004, 2004, 2004, 2004…
## $ DISCOVERY_DATE <chr> "2/2/2005 0:00", "5/12/2004 0:00", "5/31…
## $ DISCOVERY_DOY <int> 33, 133, 152, 180, 180, 182, 183, 67, 74…
## $ DISCOVERY_TIME <int> 1300, 845, 1921, 1600, 1600, 1800, 1800,…
## $ NWCG_CAUSE_CLASSIFICATION <chr> "Human", "Natural", "Human", "Natural", …
## $ NWCG_GENERAL_CAUSE <chr> "Power generation/transmission/distribut…
## $ NWCG_CAUSE_AGE_CATEGORY <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, …
## $ CONT_DATE <chr> "2/2/2005 0:00", "5/12/2004 0:00", "5/31…
## $ CONT_DOY <int> 33, 133, 152, 185, 185, 183, 184, 67, 74…
## $ CONT_TIME <int> 1730, 1530, 2024, 1400, 1200, 1600, 1400…
## $ FIRE_SIZE <dbl> 0.10, 0.25, 0.10, 0.10, 0.10, 0.10, 0.10…
## $ FIRE_SIZE_CLASS <chr> "A", "A", "A", "A", "A", "A", "A", "B", …
## $ LATITUDE <dbl> 40.03694, 38.93306, 38.98417, 38.55917, …
## $ LONGITUDE <dbl> -121.0058, -120.4044, -120.7356, -119.91…
## $ OWNER_DESCR <chr> "USFS", "USFS", "STATE OR PRIVATE", "USF…
## $ STATE <chr> "CA", "CA", "CA", "CA", "CA", "CA", "CA"…
## $ COUNTY <chr> "63", "61", "17", "3", "3", "5", "17", N…
## $ FIPS_CODE <chr> "06063", "06061", "06017", "06003", "060…
## $ FIPS_NAME <chr> "Plumas County", "Placer County", "El Do…
head(fires, n = 20L)
## # A tibble: 20 × 37
## FOD_ID FPA_ID SOURC…¹ SOURC…² NWCG_…³ NWCG_…⁴ NWCG_…⁵ SOURC…⁶ SOURC…⁷ LOCAL…⁸
## <int> <chr> <chr> <chr> <chr> <chr> <chr> <int> <chr> <int>
## 1 1 FS-14… FED FS-FIR… FS USCAPNF Plumas… 511 Plumas… 1
## 2 2 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 13
## 3 3 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 27
## 4 4 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 43
## 5 5 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 44
## 6 6 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 54
## 7 7 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 58
## 8 8 FS-14… FED FS-FIR… FS USCASHF Shasta… 514 Shasta… 3
## 9 9 FS-14… FED FS-FIR… FS USCASHF Shasta… 514 Shasta… 5
## 10 10 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 61
## 11 11 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 64
## 12 12 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 71
## 13 13 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 91
## 14 14 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 99
## 15 15 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 102
## 16 16 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 103
## 17 17 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 109
## 18 18 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 111
## 19 19 FS-14… FED FS-FIR… FS USCAENF Eldora… 503 Eldora… 115
## 20 20 FS-14… FED FS-FIR… FS USNMLNF Lincol… 308 Lincol… 8
## # … with 27 more variables: LOCAL_INCIDENT_ID <chr>, FIRE_CODE <chr>,
## # FIRE_NAME <chr>, ICS_209_PLUS_INCIDENT_JOIN_ID <chr>,
## # ICS_209_PLUS_COMPLEX_JOIN_ID <chr>, MTBS_ID <chr>, MTBS_FIRE_NAME <chr>,
## # COMPLEX_NAME <chr>, FIRE_YEAR <int>, DISCOVERY_DATE <chr>,
## # DISCOVERY_DOY <int>, DISCOVERY_TIME <int>, NWCG_CAUSE_CLASSIFICATION <chr>,
## # NWCG_GENERAL_CAUSE <chr>, NWCG_CAUSE_AGE_CATEGORY <chr>, CONT_DATE <chr>,
## # CONT_DOY <int>, CONT_TIME <int>, FIRE_SIZE <dbl>, FIRE_SIZE_CLASS <chr>, …
fires_na_count <- sapply(fires, function(y) sum(length(which(is.na(y)))))
fires_na_count <- data.frame(fires_na_count)
fires_na_count
## fires_na_count
## FOD_ID 0
## FPA_ID 0
## SOURCE_SYSTEM_TYPE 0
## SOURCE_SYSTEM 0
## NWCG_REPORTING_AGENCY 0
## NWCG_REPORTING_UNIT_ID 0
## NWCG_REPORTING_UNIT_NAME 0
## SOURCE_REPORTING_UNIT 0
## SOURCE_REPORTING_UNIT_NAME 0
## LOCAL_FIRE_REPORT_ID 1701854
## LOCAL_INCIDENT_ID 734948
## FIRE_CODE 1797096
## FIRE_NAME 939607
## ICS_209_PLUS_INCIDENT_JOIN_ID 2135993
## ICS_209_PLUS_COMPLEX_JOIN_ID 2165833
## MTBS_ID 2153848
## MTBS_FIRE_NAME 2153848
## COMPLEX_NAME 2161081
## FIRE_YEAR 0
## DISCOVERY_DATE 0
## DISCOVERY_DOY 0
## DISCOVERY_TIME 754468
## NWCG_CAUSE_CLASSIFICATION 1
## NWCG_GENERAL_CAUSE 0
## NWCG_CAUSE_AGE_CATEGORY 2093127
## CONT_DATE 854553
## CONT_DOY 854553
## CONT_TIME 933151
## FIRE_SIZE 0
## FIRE_SIZE_CLASS 0
## LATITUDE 0
## LONGITUDE 0
## OWNER_DESCR 0
## STATE 0
## COUNTY 657235
## FIPS_CODE 657235
## FIPS_NAME 657236
# select columns we plan on using.
fires_new <- fires %>%
select(FOD_ID, FIRE_YEAR, DISCOVERY_DATE, DISCOVERY_DOY, DISCOVERY_TIME, NWCG_CAUSE_CLASSIFICATION,
NWCG_GENERAL_CAUSE, CONT_DATE, CONT_DOY, CONT_TIME, FIRE_SIZE, FIRE_SIZE_CLASS,
LATITUDE, LONGITUDE, STATE, FIPS_NAME)
# rename column
fires_new <- rename(fires_new, COUNTY = FIPS_NAME)
glimpse(fires_new)
## Rows: 2,166,753
## Columns: 16
## $ FOD_ID <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1…
## $ FIRE_YEAR <int> 2005, 2004, 2004, 2004, 2004, 2004, 2004, 20…
## $ DISCOVERY_DATE <chr> "2/2/2005 0:00", "5/12/2004 0:00", "5/31/200…
## $ DISCOVERY_DOY <int> 33, 133, 152, 180, 180, 182, 183, 67, 74, 18…
## $ DISCOVERY_TIME <int> 1300, 845, 1921, 1600, 1600, 1800, 1800, 130…
## $ NWCG_CAUSE_CLASSIFICATION <chr> "Human", "Natural", "Human", "Natural", "Nat…
## $ NWCG_GENERAL_CAUSE <chr> "Power generation/transmission/distribution"…
## $ CONT_DATE <chr> "2/2/2005 0:00", "5/12/2004 0:00", "5/31/200…
## $ CONT_DOY <int> 33, 133, 152, 185, 185, 183, 184, 67, 74, 18…
## $ CONT_TIME <int> 1730, 1530, 2024, 1400, 1200, 1600, 1400, 16…
## $ FIRE_SIZE <dbl> 0.10, 0.25, 0.10, 0.10, 0.10, 0.10, 0.10, 0.…
## $ FIRE_SIZE_CLASS <chr> "A", "A", "A", "A", "A", "A", "A", "B", "B",…
## $ LATITUDE <dbl> 40.03694, 38.93306, 38.98417, 38.55917, 38.5…
## $ LONGITUDE <dbl> -121.0058, -120.4044, -120.7356, -119.9133, …
## $ STATE <chr> "CA", "CA", "CA", "CA", "CA", "CA", "CA", "C…
## $ COUNTY <chr> "Plumas County", "Placer County", "El Dorado…
# convert dates
fires_new$DISCOVERY_DATE <- as.Date(fires_new$DISCOVERY_DATE, format = "%m/%d/%Y %H:%M")
fires_new$CONT_DATE <- as.Date(fires_new$CONT_DATE, format = "%m/%d/%Y %H:%M")
# convert times
fires_new$DISCOVERY_TIME <- times(sub("(.{2})", "\\1:", sprintf("%04d:00", fires_new$DISCOVERY_TIME)))
fires_new$CONT_TIME <- times(sub("(.{2})", "\\1:", sprintf("%04d:00", fires_new$CONT_TIME)))
# convert ID to chr
fires_new %<>%
mutate_at("FOD_ID", as.character)
# check list of states in data
unique_states <- unique(fires_new$STATE)
unique_states
## [1] "CA" "NM" "OR" "NC" "WY" "CO" "WA" "MT" "UT" "AZ" "SD" "AR" "NV" "ID" "MN"
## [16] "TX" "FL" "SC" "LA" "OK" "KS" "MO" "NE" "MI" "KY" "OH" "IN" "VA" "IL" "TN"
## [31] "GA" "AK" "ND" "WV" "WI" "AL" "NH" "PA" "MS" "ME" "VT" "NY" "IA" "DC" "MD"
## [46] "CT" "MA" "NJ" "HI" "DE" "PR" "RI"
# created a list of (east/west) regions by state in csv for contiguous 48
# states to join to fires_new
regions <- read.csv("regions.csv")
# merge regions
fires_new <- left_join(fires_new, regions, by = c(STATE = "STATE"))
# remove na in REGION since I'm only interested in contiguous US
fires_new <- fires_new[!is.na(fires_new$REGION), ]
# check na count
fires_new_na_count <- sapply(fires_new, function(y) sum(length(which(is.na(y)))))
fires_new_na_count <- data.frame(fires_new_na_count)
fires_new_na_count
## fires_new_na_count
## FOD_ID 0
## FIRE_YEAR 0
## DISCOVERY_DATE 0
## DISCOVERY_DOY 0
## DISCOVERY_TIME 714898
## NWCG_CAUSE_CLASSIFICATION 1
## NWCG_GENERAL_CAUSE 0
## CONT_DATE 818560
## CONT_DOY 818560
## CONT_TIME 893634
## FIRE_SIZE 0
## FIRE_SIZE_CLASS 0
## LATITUDE 0
## LONGITUDE 0
## STATE 0
## COUNTY 621616
## REGION 0
glimpse(fires_new)
## Rows: 2,120,440
## Columns: 17
## $ FOD_ID <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9",…
## $ FIRE_YEAR <int> 2005, 2004, 2004, 2004, 2004, 2004, 2004, 20…
## $ DISCOVERY_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-06…
## $ DISCOVERY_DOY <int> 33, 133, 152, 180, 180, 182, 183, 67, 74, 18…
## $ DISCOVERY_TIME <times> 13:00:00, 08:45:00, 19:21:00, 16:00:00, 16…
## $ NWCG_CAUSE_CLASSIFICATION <chr> "Human", "Natural", "Human", "Natural", "Nat…
## $ NWCG_GENERAL_CAUSE <chr> "Power generation/transmission/distribution"…
## $ CONT_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-07…
## $ CONT_DOY <int> 33, 133, 152, 185, 185, 183, 184, 67, 74, 18…
## $ CONT_TIME <times> 17:30:00, 15:30:00, 20:24:00, 14:00:00, 12…
## $ FIRE_SIZE <dbl> 0.10, 0.25, 0.10, 0.10, 0.10, 0.10, 0.10, 0.…
## $ FIRE_SIZE_CLASS <chr> "A", "A", "A", "A", "A", "A", "A", "B", "B",…
## $ LATITUDE <dbl> 40.03694, 38.93306, 38.98417, 38.55917, 38.5…
## $ LONGITUDE <dbl> -121.0058, -120.4044, -120.7356, -119.9133, …
## $ STATE <chr> "CA", "CA", "CA", "CA", "CA", "CA", "CA", "C…
## $ COUNTY <chr> "Plumas County", "Placer County", "El Dorado…
## $ REGION <chr> "West", "West", "West", "West", "West", "Wes…
# check FOD_ID is unique using unique_id function from
# https://rdrr.io/github/EdwinTh/thatssorandom/src/R/unique_id.R
unique_id <- function(x, ...) {
id_set <- x %>%
select(...)
id_set_dist <- id_set %>%
distinct
if (nrow(id_set) == nrow(id_set_dist)) {
TRUE
} else {
non_unique_ids <- id_set %>%
filter(id_set %>%
duplicated()) %>%
distinct()
suppressMessages(inner_join(non_unique_ids, x) %>%
arrange(...))
}
}
fires_new %>%
unique_id(FOD_ID)
## [1] TRUE
glimpse(fires_new)
## Rows: 2,120,440
## Columns: 17
## $ FOD_ID <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9",…
## $ FIRE_YEAR <int> 2005, 2004, 2004, 2004, 2004, 2004, 2004, 20…
## $ DISCOVERY_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-06…
## $ DISCOVERY_DOY <int> 33, 133, 152, 180, 180, 182, 183, 67, 74, 18…
## $ DISCOVERY_TIME <times> 13:00:00, 08:45:00, 19:21:00, 16:00:00, 16…
## $ NWCG_CAUSE_CLASSIFICATION <chr> "Human", "Natural", "Human", "Natural", "Nat…
## $ NWCG_GENERAL_CAUSE <chr> "Power generation/transmission/distribution"…
## $ CONT_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-07…
## $ CONT_DOY <int> 33, 133, 152, 185, 185, 183, 184, 67, 74, 18…
## $ CONT_TIME <times> 17:30:00, 15:30:00, 20:24:00, 14:00:00, 12…
## $ FIRE_SIZE <dbl> 0.10, 0.25, 0.10, 0.10, 0.10, 0.10, 0.10, 0.…
## $ FIRE_SIZE_CLASS <chr> "A", "A", "A", "A", "A", "A", "A", "B", "B",…
## $ LATITUDE <dbl> 40.03694, 38.93306, 38.98417, 38.55917, 38.5…
## $ LONGITUDE <dbl> -121.0058, -120.4044, -120.7356, -119.9133, …
## $ STATE <chr> "CA", "CA", "CA", "CA", "CA", "CA", "CA", "C…
## $ COUNTY <chr> "Plumas County", "Placer County", "El Dorado…
## $ REGION <chr> "West", "West", "West", "West", "West", "Wes…
# convert categorical variables to factors.
factor_cols <- c("FIRE_YEAR", "DISCOVERY_DOY", "NWCG_CAUSE_CLASSIFICATION", "NWCG_GENERAL_CAUSE",
"CONT_DOY", "FIRE_SIZE_CLASS", "STATE", "COUNTY", "REGION")
fires_new %<>%
mutate_at(factor_cols, factor)
levels(fires_new$NWCG_CAUSE_CLASSIFICATION)
## [1] "Human"
## [2] "Missing data/not specified/undetermined"
## [3] "Natural"
levels(fires_new$NWCG_GENERAL_CAUSE)
## [1] "Arson/incendiarism"
## [2] "Debris and open burning"
## [3] "Equipment and vehicle use"
## [4] "Firearms and explosives use"
## [5] "Fireworks"
## [6] "Missing data/not specified/undetermined"
## [7] "Misuse of fire by a minor"
## [8] "Natural"
## [9] "Other causes"
## [10] "Power generation/transmission/distribution"
## [11] "Railroad operations and maintenance"
## [12] "Recreation and ceremony"
## [13] "Smoking"
# sum number of special values
is.special <- function(x) {
if (is.numeric(x))
!is.finite(x) else is.na(x)
}
sum(sapply(fires_new, is.special))
## [1] 3867269
# check na count
fires_new_na_count <- sapply(fires_new, function(y) sum(length(which(is.na(y)))))
fires_new_na_count <- data.frame(fires_new_na_count)
print(fires_new_na_count)
## fires_new_na_count
## FOD_ID 0
## FIRE_YEAR 0
## DISCOVERY_DATE 0
## DISCOVERY_DOY 0
## DISCOVERY_TIME 714898
## NWCG_CAUSE_CLASSIFICATION 1
## NWCG_GENERAL_CAUSE 0
## CONT_DATE 818560
## CONT_DOY 818560
## CONT_TIME 893634
## FIRE_SIZE 0
## FIRE_SIZE_CLASS 0
## LATITUDE 0
## LONGITUDE 0
## STATE 0
## COUNTY 621616
## REGION 0
# create subset cause_class where na
cause_class_na <- fires_new[is.na(fires_new$NWCG_CAUSE_CLASSIFICATION), ]
cause_class_na
## # A tibble: 1 × 17
## FOD_ID FIRE_…¹ DISCOVER…² DISCO…³ DISCO…⁴ NWCG_…⁵ NWCG_…⁶ CONT_DATE CONT_DOY
## <chr> <fct> <date> <fct> <times> <fct> <fct> <date> <fct>
## 1 4004820… 2017 2017-03-04 63 13:00:… <NA> Missin… NA <NA>
## # … with 8 more variables: CONT_TIME <times>, FIRE_SIZE <dbl>,
## # FIRE_SIZE_CLASS <fct>, LATITUDE <dbl>, LONGITUDE <dbl>, STATE <fct>,
## # COUNTY <fct>, REGION <fct>, and abbreviated variable names ¹FIRE_YEAR,
## # ²DISCOVERY_DATE, ³DISCOVERY_DOY, ⁴DISCOVERY_TIME,
## # ⁵NWCG_CAUSE_CLASSIFICATION, ⁶NWCG_GENERAL_CAUSE
# subset all general_cause with missing data
general_cause_missing <- fires_new[which(fires_new$NWCG_GENERAL_CAUSE == "Missing data/not specified/undetermined"),
]
general_cause_missing
## # A tibble: 513,433 × 17
## FOD_ID FIRE_Y…¹ DISCOVER…² DISCO…³ DISCO…⁴ NWCG_…⁵ NWCG_…⁶ CONT_DATE CONT_…⁷
## <chr> <fct> <date> <fct> <times> <fct> <fct> <date> <fct>
## 1 13 2004 2004-09-03 247 16:00:… Human Missin… 2004-09-03 247
## 2 47 2005 2005-03-11 70 10:00:… Human Missin… 2005-03-11 70
## 3 52 2005 2005-05-17 137 14:00:… Human Missin… 2005-05-17 137
## 4 54 2005 2005-04-07 97 16:15:… Human Missin… 2005-04-07 97
## 5 66 2005 2005-04-24 114 19:30:… Human Missin… 2005-04-24 114
## 6 74 2005 2005-07-01 182 20:30:… Human Missin… 2005-07-01 182
## 7 76 2005 2005-07-03 184 17:45:… Human Missin… 2005-07-03 184
## 8 150 2005 2005-05-02 122 16:28:… Human Missin… 2005-05-02 122
## 9 151 2005 2005-06-21 172 11:41:… Human Missin… 2005-06-21 172
## 10 187 2005 2005-07-18 199 14:56:… Human Missin… 2005-07-18 199
## # … with 513,423 more rows, 8 more variables: CONT_TIME <times>,
## # FIRE_SIZE <dbl>, FIRE_SIZE_CLASS <fct>, LATITUDE <dbl>, LONGITUDE <dbl>,
## # STATE <fct>, COUNTY <fct>, REGION <fct>, and abbreviated variable names
## # ¹FIRE_YEAR, ²DISCOVERY_DATE, ³DISCOVERY_DOY, ⁴DISCOVERY_TIME,
## # ⁵NWCG_CAUSE_CLASSIFICATION, ⁶NWCG_GENERAL_CAUSE, ⁷CONT_DOY
# subset all cause_class with missing data
cause_class_missing <- fires_new[which(fires_new$NWCG_CAUSE_CLASSIFICATION == "Missing data/not specified/undetermined"),
]
cause_class_missing
## # A tibble: 142,131 × 17
## FOD_ID FIRE_Y…¹ DISCOVER…² DISCO…³ DISCO…⁴ NWCG_…⁵ NWCG_…⁶ CONT_DATE CONT_…⁷
## <chr> <fct> <date> <fct> <times> <fct> <fct> <date> <fct>
## 1 192538 1999 1999-09-13 256 18:00:… Missin… Missin… NA <NA>
## 2 201350 1998 1998-06-30 181 16:00:… Missin… Missin… NA <NA>
## 3 214669 1992 1992-06-25 177 19:38:… Missin… Missin… 1992-06-25 177
## 4 214877 1994 1994-08-08 220 09:30:… Missin… Missin… 1994-08-08 220
## 5 214937 1995 1995-06-03 154 11:00:… Missin… Missin… 1995-06-03 154
## 6 214938 1995 1995-06-03 154 11:00:… Missin… Missin… 1995-06-03 154
## 7 216231 1994 1994-06-23 174 10:00:… Missin… Missin… 1994-06-23 174
## 8 216232 1994 1994-06-23 174 10:10:… Missin… Missin… 1994-06-23 174
## 9 216263 1995 1995-05-10 130 13:00:… Missin… Missin… 1995-05-10 130
## 10 216149 1992 1992-08-11 224 16:48:… Missin… Missin… 1992-08-11 224
## # … with 142,121 more rows, 8 more variables: CONT_TIME <times>,
## # FIRE_SIZE <dbl>, FIRE_SIZE_CLASS <fct>, LATITUDE <dbl>, LONGITUDE <dbl>,
## # STATE <fct>, COUNTY <fct>, REGION <fct>, and abbreviated variable names
## # ¹FIRE_YEAR, ²DISCOVERY_DATE, ³DISCOVERY_DOY, ⁴DISCOVERY_TIME,
## # ⁵NWCG_CAUSE_CLASSIFICATION, ⁶NWCG_GENERAL_CAUSE, ⁷CONT_DOY
# replace na in NWCG_CAUSE_CLASSIFICATION
fires_new$NWCG_CAUSE_CLASSIFICATION[is.na(fires_new$NWCG_CAUSE_CLASSIFICATION)] <- "Missing data/not specified/undetermined"
# outliers in cont_date
dtc_large_values <- fires_new[which(fires_new$DAYS_TO_CONT > 9490), ]
dtc_large_values
## # A tibble: 0 × 17
## # … with 17 variables: FOD_ID <chr>, FIRE_YEAR <fct>, DISCOVERY_DATE <date>,
## # DISCOVERY_DOY <fct>, DISCOVERY_TIME <times>,
## # NWCG_CAUSE_CLASSIFICATION <fct>, NWCG_GENERAL_CAUSE <fct>,
## # CONT_DATE <date>, CONT_DOY <fct>, CONT_TIME <times>, FIRE_SIZE <dbl>,
## # FIRE_SIZE_CLASS <fct>, LATITUDE <dbl>, LONGITUDE <dbl>, STATE <fct>,
## # COUNTY <fct>, REGION <fct>
cont_future_dates <- fires_new[which(fires_new$CONT_DATE > today()), ]
cont_future_dates
## # A tibble: 39 × 17
## FOD_ID FIRE_…¹ DISCOVER…² DISCO…³ DISCO…⁴ NWCG_…⁵ NWCG_…⁶ CONT_DATE CONT_…⁷
## <chr> <fct> <date> <fct> <times> <fct> <fct> <date> <fct>
## 1 578071 2008 2008-04-23 114 17:21:… Natural Natural 2023-04-23 113
## 2 642658 2006 2006-04-18 108 00:00:… Human Missin… 2045-06-21 172
## 3 642680 2006 2006-04-18 108 00:00:… Human Arson/… 2030-06-18 169
## 4 719640 2006 2006-06-30 181 18:29:… Natural Natural 2060-06-30 182
## 5 1336443 2001 2001-06-19 170 19:19:… Human Equipm… 2100-06-19 170
## 6 201145… 2011 2011-12-08 342 17:35:… Human Debris… 2022-12-08 342
## 7 201168… 2011 2011-02-15 46 10:30:… Human Equipm… 9999-12-31 365
## 8 201168… 2011 2011-04-06 96 10:30:… Human Debris… 9999-12-31 365
## 9 201607… 2012 2012-07-07 189 16:48:… Human Equipm… 6476-07-01 183
## 10 201608… 2012 2012-08-27 240 18:30:… Natural Natural 5012-08-27 240
## # … with 29 more rows, 8 more variables: CONT_TIME <times>, FIRE_SIZE <dbl>,
## # FIRE_SIZE_CLASS <fct>, LATITUDE <dbl>, LONGITUDE <dbl>, STATE <fct>,
## # COUNTY <fct>, REGION <fct>, and abbreviated variable names ¹FIRE_YEAR,
## # ²DISCOVERY_DATE, ³DISCOVERY_DOY, ⁴DISCOVERY_TIME,
## # ⁵NWCG_CAUSE_CLASSIFICATION, ⁶NWCG_GENERAL_CAUSE, ⁷CONT_DOY
# replace future cont_date with na
fires_new$CONT_DATE[fires_new$CONT_DATE > today()] <- NA
# create new column discovery to containment days
fires_new$DAYS_TO_CONT <- as.numeric(difftime(fires_new$CONT_DATE, fires_new$DISCOVERY_DATE),
units = "days")
# outliers in days_to_cont (longest burning wildfire in guinness book of
# records is 5 months roughly 150 days)
dtc_large_values <- fires_new[which(fires_new$DAYS_TO_CONT > 150), ]
dtc_large_values
## # A tibble: 484 × 18
## FOD_ID FIRE_Y…¹ DISCOVER…² DISCO…³ DISCO…⁴ NWCG_…⁵ NWCG_…⁶ CONT_DATE CONT_…⁷
## <chr> <fct> <date> <fct> <times> <fct> <fct> <date> <fct>
## 1 17669 2006 2006-07-09 190 20:05:… Natural Natural 2006-12-11 345
## 2 17677 2006 2006-07-09 190 20:05:… Natural Natural 2006-12-11 345
## 3 18397 2007 2007-01-01 1 10:00:… Human Arson/… 2007-06-11 162
## 4 25401 2007 2007-08-05 217 19:00:… Natural Natural 2008-01-07 7
## 5 27934 2008 2008-06-22 174 08:30:… Natural Natural 2008-12-16 351
## 6 29452 2008 2008-07-07 189 17:00:… Natural Natural 2008-12-16 351
## 7 31234 2008 2008-03-11 71 10:00:… Human Recrea… 2008-09-11 255
## 8 33716 2008 2008-06-22 174 12:30:… Natural Natural 2008-11-20 325
## 9 34942 2008 2008-04-27 118 16:30:… Natural Natural 2009-04-27 117
## 10 39338 2009 2009-01-18 18 15:30:… Human Arson/… 2009-08-19 231
## # … with 474 more rows, 9 more variables: CONT_TIME <times>, FIRE_SIZE <dbl>,
## # FIRE_SIZE_CLASS <fct>, LATITUDE <dbl>, LONGITUDE <dbl>, STATE <fct>,
## # COUNTY <fct>, REGION <fct>, DAYS_TO_CONT <dbl>, and abbreviated variable
## # names ¹FIRE_YEAR, ²DISCOVERY_DATE, ³DISCOVERY_DOY, ⁴DISCOVERY_TIME,
## # ⁵NWCG_CAUSE_CLASSIFICATION, ⁶NWCG_GENERAL_CAUSE, ⁷CONT_DOY
# replace days_to_cont > 150 with NA
fires_new$DAYS_TO_CONT[fires_new$DAYS_TO_CONT > 150] <- NA
glimpse(fires_new)
## Rows: 2,120,440
## Columns: 18
## $ FOD_ID <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9",…
## $ FIRE_YEAR <fct> 2005, 2004, 2004, 2004, 2004, 2004, 2004, 20…
## $ DISCOVERY_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-06…
## $ DISCOVERY_DOY <fct> 33, 133, 152, 180, 180, 182, 183, 67, 74, 18…
## $ DISCOVERY_TIME <times> 13:00:00, 08:45:00, 19:21:00, 16:00:00, 16…
## $ NWCG_CAUSE_CLASSIFICATION <fct> Human, Natural, Human, Natural, Natural, Nat…
## $ NWCG_GENERAL_CAUSE <fct> Power generation/transmission/distribution, …
## $ CONT_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-07…
## $ CONT_DOY <fct> 33, 133, 152, 185, 185, 183, 184, 67, 74, 18…
## $ CONT_TIME <times> 17:30:00, 15:30:00, 20:24:00, 14:00:00, 12…
## $ FIRE_SIZE <dbl> 0.10, 0.25, 0.10, 0.10, 0.10, 0.10, 0.10, 0.…
## $ FIRE_SIZE_CLASS <fct> A, A, A, A, A, A, A, B, B, A, A, A, A, B, A,…
## $ LATITUDE <dbl> 40.03694, 38.93306, 38.98417, 38.55917, 38.5…
## $ LONGITUDE <dbl> -121.0058, -120.4044, -120.7356, -119.9133, …
## $ STATE <fct> CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, …
## $ COUNTY <fct> Plumas County, Placer County, El Dorado Coun…
## $ REGION <fct> West, West, West, West, West, West, West, We…
## $ DAYS_TO_CONT <dbl> 0, 0, 0, 5, 5, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,…
# check na
fires_new_na_count <- sapply(fires_new, function(y) sum(length(which(is.na(y)))))
fires_new_na_count <- data.frame(fires_new_na_count)
fires_new_na_count
## fires_new_na_count
## FOD_ID 0
## FIRE_YEAR 0
## DISCOVERY_DATE 0
## DISCOVERY_DOY 0
## DISCOVERY_TIME 714898
## NWCG_CAUSE_CLASSIFICATION 0
## NWCG_GENERAL_CAUSE 0
## CONT_DATE 818599
## CONT_DOY 818560
## CONT_TIME 893634
## FIRE_SIZE 0
## FIRE_SIZE_CLASS 0
## LATITUDE 0
## LONGITUDE 0
## STATE 0
## COUNTY 621616
## REGION 0
## DAYS_TO_CONT 819083
fires_new %>%
count(DAYS_TO_CONT)
## # A tibble: 152 × 2
## DAYS_TO_CONT n
## <dbl> <int>
## 1 0 1089878
## 2 1 120743
## 3 2 29253
## 4 3 14500
## 5 4 8173
## 6 5 5746
## 7 6 4280
## 8 7 3583
## 9 8 2549
## 10 9 2097
## # … with 142 more rows
# replace na values in days_to_cont by state/fire_size_class group medians and
# round them
fires_new <- fires_new %>%
group_by(FIRE_SIZE_CLASS, STATE) %>%
mutate(DAYS_TO_CONT = ifelse(is.na(DAYS_TO_CONT), round(median(DAYS_TO_CONT,
na.rm = TRUE)), DAYS_TO_CONT)) %>%
ungroup()
glimpse(fires_new)
## Rows: 2,120,440
## Columns: 18
## $ FOD_ID <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9",…
## $ FIRE_YEAR <fct> 2005, 2004, 2004, 2004, 2004, 2004, 2004, 20…
## $ DISCOVERY_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-06…
## $ DISCOVERY_DOY <fct> 33, 133, 152, 180, 180, 182, 183, 67, 74, 18…
## $ DISCOVERY_TIME <times> 13:00:00, 08:45:00, 19:21:00, 16:00:00, 16…
## $ NWCG_CAUSE_CLASSIFICATION <fct> Human, Natural, Human, Natural, Natural, Nat…
## $ NWCG_GENERAL_CAUSE <fct> Power generation/transmission/distribution, …
## $ CONT_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-07…
## $ CONT_DOY <fct> 33, 133, 152, 185, 185, 183, 184, 67, 74, 18…
## $ CONT_TIME <times> 17:30:00, 15:30:00, 20:24:00, 14:00:00, 12…
## $ FIRE_SIZE <dbl> 0.10, 0.25, 0.10, 0.10, 0.10, 0.10, 0.10, 0.…
## $ FIRE_SIZE_CLASS <fct> A, A, A, A, A, A, A, B, B, A, A, A, A, B, A,…
## $ LATITUDE <dbl> 40.03694, 38.93306, 38.98417, 38.55917, 38.5…
## $ LONGITUDE <dbl> -121.0058, -120.4044, -120.7356, -119.9133, …
## $ STATE <fct> CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, …
## $ COUNTY <fct> Plumas County, Placer County, El Dorado Coun…
## $ REGION <fct> West, West, West, West, West, West, West, We…
## $ DAYS_TO_CONT <dbl> 0, 0, 0, 5, 5, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,…
# check na
fires_new_na_count <- sapply(fires_new, function(y) sum(length(which(is.na(y)))))
fires_new_na_count <- data.frame(fires_new_na_count)
fires_new_na_count
## fires_new_na_count
## FOD_ID 0
## FIRE_YEAR 0
## DISCOVERY_DATE 0
## DISCOVERY_DOY 0
## DISCOVERY_TIME 714898
## NWCG_CAUSE_CLASSIFICATION 0
## NWCG_GENERAL_CAUSE 0
## CONT_DATE 818599
## CONT_DOY 818560
## CONT_TIME 893634
## FIRE_SIZE 0
## FIRE_SIZE_CLASS 0
## LATITUDE 0
## LONGITUDE 0
## STATE 0
## COUNTY 621616
## REGION 0
## DAYS_TO_CONT 6
fires_new %>%
count(DAYS_TO_CONT)
## # A tibble: 152 × 2
## DAYS_TO_CONT n
## <dbl> <int>
## 1 0 1870956
## 2 1 149831
## 3 2 35682
## 4 3 15084
## 5 4 9186
## 6 5 5912
## 7 6 4617
## 8 7 3648
## 9 8 2568
## 10 9 2106
## # … with 142 more rows
# subset remaining na's to review
days_to_cont_na <- fires_new[is.na(fires_new$DAYS_TO_CONT), ]
days_to_cont_na
## # A tibble: 6 × 18
## FOD_ID FIRE_…¹ DISCOVER…² DISCO…³ DISCO…⁴ NWCG_…⁵ NWCG_…⁶ CONT_DATE CONT_DOY
## <chr> <fct> <date> <fct> <times> <fct> <fct> <date> <fct>
## 1 373439 1999 1999-04-17 107 09:00:… Missin… Missin… NA <NA>
## 2 1833591 1995 1995-08-24 236 <NA> Human Arson/… NA <NA>
## 3 15042262 1997 1997-10-22 295 22:01:… Human Missin… NA <NA>
## 4 2018244… 2013 2013-05-14 134 14:45:… Human Equipm… NA <NA>
## 5 4000040… 2016 2016-11-12 317 15:30:… Human Missin… NA <NA>
## 6 4000364… 2016 2016-09-10 254 <NA> Natural Natural NA <NA>
## # … with 9 more variables: CONT_TIME <times>, FIRE_SIZE <dbl>,
## # FIRE_SIZE_CLASS <fct>, LATITUDE <dbl>, LONGITUDE <dbl>, STATE <fct>,
## # COUNTY <fct>, REGION <fct>, DAYS_TO_CONT <dbl>, and abbreviated variable
## # names ¹FIRE_YEAR, ²DISCOVERY_DATE, ³DISCOVERY_DOY, ⁴DISCOVERY_TIME,
## # ⁵NWCG_CAUSE_CLASSIFICATION, ⁶NWCG_GENERAL_CAUSE
# replace remaining na values in days_to_cont by fire_size_class group median
# only and round them
fires_new <- fires_new %>%
group_by(FIRE_SIZE_CLASS) %>%
mutate(DAYS_TO_CONT = ifelse(is.na(DAYS_TO_CONT), round(median(DAYS_TO_CONT,
na.rm = TRUE)), DAYS_TO_CONT)) %>%
ungroup()
# check na
fires_new_na_count <- sapply(fires_new, function(y) sum(length(which(is.na(y)))))
fires_new_na_count <- data.frame(fires_new_na_count)
fires_new_na_count
## fires_new_na_count
## FOD_ID 0
## FIRE_YEAR 0
## DISCOVERY_DATE 0
## DISCOVERY_DOY 0
## DISCOVERY_TIME 714898
## NWCG_CAUSE_CLASSIFICATION 0
## NWCG_GENERAL_CAUSE 0
## CONT_DATE 818599
## CONT_DOY 818560
## CONT_TIME 893634
## FIRE_SIZE 0
## FIRE_SIZE_CLASS 0
## LATITUDE 0
## LONGITUDE 0
## STATE 0
## COUNTY 621616
## REGION 0
## DAYS_TO_CONT 0
# check null
fires_new_null_count <- sapply(fires_new, function(y) sum(length(which(is.null(y)))))
fires_new_null_count <- data.frame(fires_new_null_count)
fires_new_null_count
## fires_new_null_count
## FOD_ID 0
## FIRE_YEAR 0
## DISCOVERY_DATE 0
## DISCOVERY_DOY 0
## DISCOVERY_TIME 0
## NWCG_CAUSE_CLASSIFICATION 0
## NWCG_GENERAL_CAUSE 0
## CONT_DATE 0
## CONT_DOY 0
## CONT_TIME 0
## FIRE_SIZE 0
## FIRE_SIZE_CLASS 0
## LATITUDE 0
## LONGITUDE 0
## STATE 0
## COUNTY 0
## REGION 0
## DAYS_TO_CONT 0
glimpse(fires_new)
## Rows: 2,120,440
## Columns: 18
## $ FOD_ID <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9",…
## $ FIRE_YEAR <fct> 2005, 2004, 2004, 2004, 2004, 2004, 2004, 20…
## $ DISCOVERY_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-06…
## $ DISCOVERY_DOY <fct> 33, 133, 152, 180, 180, 182, 183, 67, 74, 18…
## $ DISCOVERY_TIME <times> 13:00:00, 08:45:00, 19:21:00, 16:00:00, 16…
## $ NWCG_CAUSE_CLASSIFICATION <fct> Human, Natural, Human, Natural, Natural, Nat…
## $ NWCG_GENERAL_CAUSE <fct> Power generation/transmission/distribution, …
## $ CONT_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-07…
## $ CONT_DOY <fct> 33, 133, 152, 185, 185, 183, 184, 67, 74, 18…
## $ CONT_TIME <times> 17:30:00, 15:30:00, 20:24:00, 14:00:00, 12…
## $ FIRE_SIZE <dbl> 0.10, 0.25, 0.10, 0.10, 0.10, 0.10, 0.10, 0.…
## $ FIRE_SIZE_CLASS <fct> A, A, A, A, A, A, A, B, B, A, A, A, A, B, A,…
## $ LATITUDE <dbl> 40.03694, 38.93306, 38.98417, 38.55917, 38.5…
## $ LONGITUDE <dbl> -121.0058, -120.4044, -120.7356, -119.9133, …
## $ STATE <fct> CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, …
## $ COUNTY <fct> Plumas County, Placer County, El Dorado Coun…
## $ REGION <fct> West, West, West, West, West, West, West, We…
## $ DAYS_TO_CONT <dbl> 0, 0, 0, 5, 5, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,…
# create CA df
fires_new_ca <- subset(fires_new, STATE == "CA")
glimpse(fires_new_ca)
## Rows: 235,229
## Columns: 18
## $ FOD_ID <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9",…
## $ FIRE_YEAR <fct> 2005, 2004, 2004, 2004, 2004, 2004, 2004, 20…
## $ DISCOVERY_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-06…
## $ DISCOVERY_DOY <fct> 33, 133, 152, 180, 180, 182, 183, 67, 74, 18…
## $ DISCOVERY_TIME <times> 13:00:00, 08:45:00, 19:21:00, 16:00:00, 16…
## $ NWCG_CAUSE_CLASSIFICATION <fct> Human, Natural, Human, Natural, Natural, Nat…
## $ NWCG_GENERAL_CAUSE <fct> Power generation/transmission/distribution, …
## $ CONT_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-07…
## $ CONT_DOY <fct> 33, 133, 152, 185, 185, 183, 184, 67, 74, 18…
## $ CONT_TIME <times> 17:30:00, 15:30:00, 20:24:00, 14:00:00, 12…
## $ FIRE_SIZE <dbl> 0.10, 0.25, 0.10, 0.10, 0.10, 0.10, 0.10, 0.…
## $ FIRE_SIZE_CLASS <fct> A, A, A, A, A, A, A, B, B, A, A, A, A, B, A,…
## $ LATITUDE <dbl> 40.03694, 38.93306, 38.98417, 38.55917, 38.5…
## $ LONGITUDE <dbl> -121.0058, -120.4044, -120.7356, -119.9133, …
## $ STATE <fct> CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, …
## $ COUNTY <fct> Plumas County, Placer County, El Dorado Coun…
## $ REGION <fct> West, West, West, West, West, West, West, We…
## $ DAYS_TO_CONT <dbl> 0, 0, 0, 5, 5, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,…
# check na
fires_new_ca_na_count <- sapply(fires_new_ca, function(y) sum(length(which(is.na(y)))))
fires_new_ca_na_count <- data.frame(fires_new_ca_na_count)
print(fires_new_ca_na_count)
## fires_new_ca_na_count
## FOD_ID 0
## FIRE_YEAR 0
## DISCOVERY_DATE 0
## DISCOVERY_DOY 0
## DISCOVERY_TIME 43080
## NWCG_CAUSE_CLASSIFICATION 0
## NWCG_GENERAL_CAUSE 0
## CONT_DATE 103889
## CONT_DOY 103888
## CONT_TIME 104527
## FIRE_SIZE 0
## FIRE_SIZE_CLASS 0
## LATITUDE 0
## LONGITUDE 0
## STATE 0
## COUNTY 92678
## REGION 0
## DAYS_TO_CONT 0
# create West df
fires_new_west <- subset(fires_new, REGION == "West")
glimpse(fires_new_west)
## Rows: 990,281
## Columns: 18
## $ FOD_ID <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9",…
## $ FIRE_YEAR <fct> 2005, 2004, 2004, 2004, 2004, 2004, 2004, 20…
## $ DISCOVERY_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-06…
## $ DISCOVERY_DOY <fct> 33, 133, 152, 180, 180, 182, 183, 67, 74, 18…
## $ DISCOVERY_TIME <times> 13:00:00, 08:45:00, 19:21:00, 16:00:00, 16…
## $ NWCG_CAUSE_CLASSIFICATION <fct> Human, Natural, Human, Natural, Natural, Nat…
## $ NWCG_GENERAL_CAUSE <fct> Power generation/transmission/distribution, …
## $ CONT_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-07…
## $ CONT_DOY <fct> 33, 133, 152, 185, 185, 183, 184, 67, 74, 18…
## $ CONT_TIME <times> 17:30:00, 15:30:00, 20:24:00, 14:00:00, 12…
## $ FIRE_SIZE <dbl> 0.10, 0.25, 0.10, 0.10, 0.10, 0.10, 0.10, 0.…
## $ FIRE_SIZE_CLASS <fct> A, A, A, A, A, A, A, B, B, A, A, A, A, B, A,…
## $ LATITUDE <dbl> 40.03694, 38.93306, 38.98417, 38.55917, 38.5…
## $ LONGITUDE <dbl> -121.0058, -120.4044, -120.7356, -119.9133, …
## $ STATE <fct> CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, …
## $ COUNTY <fct> Plumas County, Placer County, El Dorado Coun…
## $ REGION <fct> West, West, West, West, West, West, West, We…
## $ DAYS_TO_CONT <dbl> 0, 0, 0, 5, 5, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,…
# create East df
fires_new_east <- subset(fires_new, REGION == "East")
glimpse(fires_new_east)
## Rows: 1,130,159
## Columns: 18
## $ FOD_ID <chr> "37", "38", "39", "40", "41", "42", "43", "4…
## $ FIRE_YEAR <fct> 2005, 2005, 2005, 2005, 2005, 2005, 2005, 20…
## $ DISCOVERY_DATE <date> 2005-03-11, 2005-01-27, 2005-02-06, 2005-02…
## $ DISCOVERY_DOY <fct> 70, 27, 37, 43, 106, 129, 129, 70, 180, 195,…
## $ DISCOVERY_TIME <times> 15:00:00, 22:00:00, 19:00:00, 15:20:00, 16…
## $ NWCG_CAUSE_CLASSIFICATION <fct> Human, Human, Human, Human, Human, Human, Hu…
## $ NWCG_GENERAL_CAUSE <fct> Equipment and vehicle use, Arson/incendiaris…
## $ CONT_DATE <date> 2005-03-11, 2005-01-28, 2005-02-06, 2005-02…
## $ CONT_DOY <fct> 70, 28, 37, 44, 106, 129, 129, 70, 180, 196,…
## $ CONT_TIME <times> 18:00:00, 03:00:00, 20:00:00, 03:30:00, 22…
## $ FIRE_SIZE <dbl> 0.60, 50.30, 0.10, 125.00, 25.00, 3.00, 10.0…
## $ FIRE_SIZE_CLASS <fct> B, C, A, D, C, B, C, C, A, A, A, A, A, A, A,…
## $ LATITUDE <dbl> 35.22833, 35.00028, 35.93167, 36.00167, 35.9…
## $ LONGITUDE <dbl> -82.88444, -83.35111, -81.71667, -81.59000, …
## $ STATE <fct> NC, NC, NC, NC, NC, NC, NC, NC, AR, MN, MN, …
## $ COUNTY <fct> Buncombe County, Macon County, Caldwell Coun…
## $ REGION <fct> East, East, East, East, East, East, East, Ea…
## $ DAYS_TO_CONT <dbl> 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,…
##Import US (48 states) Drought Indicator Data (5 year SPEI) 1992 - 2020 from CSV
us_SPEI <- read.csv("drought_fig-2_US5SPEI.csv")
glimpse(us_SPEI)
## Rows: 29
## Columns: 2
## $ Year <int> 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2…
## $ Five.year.SPEI.value <dbl> -0.29736869, 0.32903045, 0.44245068, 0.74043491, …
us_SPEI %<>%
mutate_at("Year", factor)
glimpse(us_SPEI)
## Rows: 29
## Columns: 2
## $ Year <fct> 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2…
## $ Five.year.SPEI.value <dbl> -0.29736869, 0.32903045, 0.44245068, 0.74043491, …
# rename column
us_SPEI <- rename(us_SPEI, US_5yr_SPEI = Five.year.SPEI.value)
glimpse(us_SPEI)
## Rows: 29
## Columns: 2
## $ Year <fct> 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001…
## $ US_5yr_SPEI <dbl> -0.29736869, 0.32903045, 0.44245068, 0.74043491, 0.7170902…
##Import CA Drought Indicator Data (5 year SPEI) 1992 - 2021 from CSV
ca_SPEI <- read.csv("CA_5SPEI.csv")
glimpse(ca_SPEI)
## Rows: 30
## Columns: 2
## $ Year <int> 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2…
## $ Five.year.SPEI.value <dbl> -1.19, -0.52, -0.81, 0.28, 0.36, 0.60, 0.96, 1.25…
ca_SPEI %<>%
mutate_at("Year", factor)
glimpse(ca_SPEI)
## Rows: 30
## Columns: 2
## $ Year <fct> 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2…
## $ Five.year.SPEI.value <dbl> -1.19, -0.52, -0.81, 0.28, 0.36, 0.60, 0.96, 1.25…
ca_SPEI <- rename(ca_SPEI, CA_5yr_SPEI = Five.year.SPEI.value)
glimpse(ca_SPEI)
## Rows: 30
## Columns: 2
## $ Year <fct> 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001…
## $ CA_5yr_SPEI <dbl> -1.19, -0.52, -0.81, 0.28, 0.36, 0.60, 0.96, 1.25, 0.67, 0…
combined_SPEI <- merge(us_SPEI, ca_SPEI, by = "Year", all = TRUE)
glimpse(combined_SPEI)
## Rows: 30
## Columns: 3
## $ Year <fct> 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001…
## $ US_5yr_SPEI <dbl> -0.29736869, 0.32903045, 0.44245068, 0.74043491, 0.7170902…
## $ CA_5yr_SPEI <dbl> -1.19, -0.52, -0.81, 0.28, 0.36, 0.60, 0.96, 1.25, 0.67, 0…
# Reshape data frame
combined_SPEI_group <- data.frame(Year = combined_SPEI$Year, SPEI = c(combined_SPEI$US_5yr_SPEI,
combined_SPEI$CA_5yr_SPEI), Location = c(rep("US_5yr_SPEI", nrow(combined_SPEI)),
rep("CA_5yr_SPEI", nrow(combined_SPEI))))
combined_SPEI_group
## Year SPEI Location
## 1 1992 -0.29736869 US_5yr_SPEI
## 2 1993 0.32903045 US_5yr_SPEI
## 3 1994 0.44245068 US_5yr_SPEI
## 4 1995 0.74043491 US_5yr_SPEI
## 5 1996 0.71709022 US_5yr_SPEI
## 6 1997 1.06105053 US_5yr_SPEI
## 7 1998 0.93202471 US_5yr_SPEI
## 8 1999 0.90186513 US_5yr_SPEI
## 9 2000 0.41446862 US_5yr_SPEI
## 10 2001 0.21723003 US_5yr_SPEI
## 11 2002 -0.38721720 US_5yr_SPEI
## 12 2003 -0.60775797 US_5yr_SPEI
## 13 2004 -0.71237166 US_5yr_SPEI
## 14 2005 -0.25630079 US_5yr_SPEI
## 15 2006 -0.32339736 US_5yr_SPEI
## 16 2007 -0.11457071 US_5yr_SPEI
## 17 2008 -0.18118461 US_5yr_SPEI
## 18 2009 -0.02593685 US_5yr_SPEI
## 19 2010 -0.05834806 US_5yr_SPEI
## 20 2011 0.30089741 US_5yr_SPEI
## 21 2012 0.02220695 US_5yr_SPEI
## 22 2013 -0.08733837 US_5yr_SPEI
## 23 2014 -0.06269047 US_5yr_SPEI
## 24 2015 -0.17811563 US_5yr_SPEI
## 25 2016 -0.20807585 US_5yr_SPEI
## 26 2017 0.17133957 US_5yr_SPEI
## 27 2018 0.77183745 US_5yr_SPEI
## 28 2019 1.05438416 US_5yr_SPEI
## 29 2020 1.03854978 US_5yr_SPEI
## 30 2021 NA US_5yr_SPEI
## 31 1992 -1.19000000 CA_5yr_SPEI
## 32 1993 -0.52000000 CA_5yr_SPEI
## 33 1994 -0.81000000 CA_5yr_SPEI
## 34 1995 0.28000000 CA_5yr_SPEI
## 35 1996 0.36000000 CA_5yr_SPEI
## 36 1997 0.60000000 CA_5yr_SPEI
## 37 1998 0.96000000 CA_5yr_SPEI
## 38 1999 1.25000000 CA_5yr_SPEI
## 39 2000 0.67000000 CA_5yr_SPEI
## 40 2001 0.33000000 CA_5yr_SPEI
## 41 2002 -0.05000000 CA_5yr_SPEI
## 42 2003 -0.98000000 CA_5yr_SPEI
## 43 2004 -1.04000000 CA_5yr_SPEI
## 44 2005 -0.73000000 CA_5yr_SPEI
## 45 2006 -0.16000000 CA_5yr_SPEI
## 46 2007 -0.26000000 CA_5yr_SPEI
## 47 2008 -0.53000000 CA_5yr_SPEI
## 48 2009 -0.74000000 CA_5yr_SPEI
## 49 2010 -0.80000000 CA_5yr_SPEI
## 50 2011 -0.60000000 CA_5yr_SPEI
## 51 2012 -0.56000000 CA_5yr_SPEI
## 52 2013 -0.61000000 CA_5yr_SPEI
## 53 2014 -1.03000000 CA_5yr_SPEI
## 54 2015 -1.65000000 CA_5yr_SPEI
## 55 2016 -1.93000000 CA_5yr_SPEI
## 56 2017 -1.38000000 CA_5yr_SPEI
## 57 2018 -1.45000000 CA_5yr_SPEI
## 58 2019 -0.61000000 CA_5yr_SPEI
## 59 2020 -0.61000000 CA_5yr_SPEI
## 60 2021 -1.10000000 CA_5yr_SPEI
# convert to factor
combined_SPEI_group %<>%
mutate_at("Location", factor)
glimpse(combined_SPEI_group)
## Rows: 60
## Columns: 3
## $ Year <fct> 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2…
## $ SPEI <dbl> -0.29736869, 0.32903045, 0.44245068, 0.74043491, 0.71709022, …
## $ Location <fct> US_5yr_SPEI, US_5yr_SPEI, US_5yr_SPEI, US_5yr_SPEI, US_5yr_SP…
ggp <- ggplot(combined_SPEI_group, aes(x = Year, y = SPEI, col = Location, group = Location)) +
geom_line()
ggp
ggp <- ggplot(combined_SPEI_group, aes(Year, SPEI, fill = Location)) + geom_bar(stat = "identity",
position = "dodge") + labs(x = "YEAR", y = "SPEI", title = "Average Five-year SPEI by Year")
ggp
# Draw plot in different panels ggp + facet_grid(Group ~ .)
summary(fires_new) #discovered DAY_TO_CONT outliers and then found containment dates in the future so went back to transformation step to replace them with NA
## FOD_ID FIRE_YEAR DISCOVERY_DATE DISCOVERY_DOY
## Length:2120440 2006 : 114321 Min. :1992-01-01 185 : 15613
## Class :character 2000 : 96020 1st Qu.:1999-08-04 186 : 13499
## Mode :character 2011 : 95302 Median :2006-03-04 100 : 10503
## 2007 : 93666 Mean :2005-10-05 101 : 10470
## 1999 : 88851 3rd Qu.:2011-12-18 108 : 10440
## 2008 : 87150 Max. :2018-12-31 83 : 10208
## (Other):1545130 (Other):2049707
## DISCOVERY_TIME NWCG_CAUSE_CLASSIFICATION
## Min. :00:00:00 Human :1670177
## 1st Qu.:12:34:00 Missing data/not specified/undetermined: 142132
## Median :14:55:00 Natural : 308131
## Mean :14:36:34
## 3rd Qu.:17:10:00
## Max. :23:59:00
## NA's :714898
## NWCG_GENERAL_CAUSE CONT_DATE
## Missing data/not specified/undetermined:513433 Min. :1992-01-01
## Debris and open burning :504672 1st Qu.:2000-06-08
## Arson/incendiarism :309760 Median :2007-04-20
## Natural :308131 Mean :2006-09-09
## Equipment and vehicle use :175330 3rd Qu.:2013-07-16
## Recreation and ceremony : 90826 Max. :2021-07-24
## (Other) :218288 NA's :818599
## CONT_DOY CONT_TIME FIRE_SIZE FIRE_SIZE_CLASS
## 185 : 9587 Min. :00:00:00 Min. : 0.0 A: 794755
## 186 : 9061 1st Qu.:13:05:00 1st Qu.: 0.1 B:1023919
## 184 : 6726 Median :15:55:00 Median : 1.0 C: 242174
## 187 : 6594 Mean :15:23:35 Mean : 61.5 D: 31636
## 108 : 6478 3rd Qu.:18:10:00 3rd Qu.: 3.0 E: 15653
## (Other):1263434 Max. :23:59:00 Max. :662700.0 F: 8561
## NA's : 818560 NA's :893634 G: 3742
## LATITUDE LONGITUDE STATE
## Min. :24.58 Min. :-124.72 CA : 235229
## 1st Qu.:33.07 1st Qu.:-110.45 GA : 180175
## Median :35.69 Median : -93.09 TX : 167061
## Mean :36.98 Mean : -95.86 NC : 123793
## 3rd Qu.:40.78 3rd Qu.: -82.55 FL : 99356
## Max. :49.34 Max. : -66.97 AZ : 93417
## (Other):1221409
## COUNTY REGION DAYS_TO_CONT
## Riverside County : 14989 East:1130159 Min. : 0.0000
## Maricopa County : 12984 West: 990281 1st Qu.: 0.0000
## Lincoln County : 11852 Median : 0.0000
## Washington County: 11815 Mean : 0.5043
## Jackson County : 11458 3rd Qu.: 0.0000
## (Other) :1435726 Max. :150.0000
## NA's : 621616
summary(combined_SPEI)
## Year US_5yr_SPEI CA_5yr_SPEI
## 1992 : 1 Min. :-0.71237 Min. :-1.9300
## 1993 : 1 1st Qu.:-0.18118 1st Qu.:-1.0175
## 1994 : 1 Median : 0.02221 Median :-0.6100
## 1995 : 1 Mean : 0.19359 Mean :-0.4963
## 1996 : 1 3rd Qu.: 0.71709 3rd Qu.:-0.0775
## 1997 : 1 Max. : 1.06105 Max. : 1.2500
## (Other):24 NA's :1
summary(combined_SPEI_group)
## Year SPEI Location
## 1992 : 2 Min. :-1.9300 CA_5yr_SPEI:30
## 1993 : 2 1st Qu.:-0.6100 US_5yr_SPEI:30
## 1994 : 2 Median :-0.1781
## 1995 : 2 Mean :-0.1572
## 1996 : 2 3rd Qu.: 0.3450
## 1997 : 2 Max. : 1.2500
## (Other):48 NA's :1
attach(fires_new)
fires_new_num <- cbind(FIRE_SIZE, LATITUDE, LONGITUDE, DAYS_TO_CONT)
options(scipen = 100)
options(digits = 2)
stat.desc(fires_new_num)
## FIRE_SIZE LATITUDE LONGITUDE DAYS_TO_CONT
## nbr.val 2120440.00000 2120440.0000 2120440.000 2120440.0000
## nbr.null 0.00000 0.0000 0.000 1870956.0000
## nbr.na 0.00000 0.0000 0.000 0.0000
## min 0.00001 24.5817 -124.719 0.0000
## max 662700.00000 49.3434 -66.971 150.0000
## range 662699.99999 24.7617 57.748 150.0000
## sum 130409712.66672 78415454.4931 -203270757.872 1069271.0000
## median 0.97000 35.6909 -93.088 0.0000
## mean 61.50125 36.9807 -95.863 0.5043
## SE.mean 1.40406 0.0036 0.011 0.0028
## CI.mean.0.95 2.75192 0.0071 0.021 0.0054
## var 4180231.50750 27.6174 237.263 16.3207
## std.dev 2044.56145 5.2552 15.403 4.0399
## coef.var 33.24423 0.1421 -0.161 8.0114
attach(combined_SPEI)
combined_SPEI_num <- cbind(US_5yr_SPEI, CA_5yr_SPEI)
stat.desc(combined_SPEI_num)
## US_5yr_SPEI CA_5yr_SPEI
## nbr.val 29.000 30.00
## nbr.null 0.000 0.00
## nbr.na 1.000 0.00
## min -0.712 -1.93
## max 1.061 1.25
## range 1.773 3.18
## sum 5.614 -14.89
## median 0.022 -0.61
## mean 0.194 -0.50
## SE.mean 0.097 0.14
## CI.mean.0.95 0.199 0.29
## var 0.273 0.59
## std.dev 0.522 0.77
## coef.var 2.697 -1.55
# Histograms
# https://stackoverflow.com/questions/11610377/how-do-i-change-the-formatting-of-numbers-on-an-axis-with-ggplot
fancy_scientific <- function(l) {
# turn in to character string in scientific notation
l <- format(l, scientific = TRUE)
# quote the part before the exponent to keep all the digits
l <- gsub("^(.*)e", "'\\1'e", l)
# remove + after exponent, if exists. E.g.: (3x10^+2 -> 3x10^2)
l <- gsub("e\\+", "e", l)
# turn the 'e+' into plotmath format
l <- gsub("e", "%*%10^", l)
# convert 1x10^ or 1.000x10^ -> 10^
l <- gsub("\\'1[\\.0]*\\'\\%\\*\\%", "", l)
# return this as an expression
parse(text = l)
}
# us fire size
ggplot(data = fires_new, aes(x = FIRE_SIZE)) + geom_histogram(bins = 100) + scale_x_log10(breaks = trans_breaks("log10",
function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) + scale_y_continuous(labels = fancy_scientific) +
ggtitle("US Fire Size in Acres") + ylab("FREQUENCY") + theme(title = element_text(size = 20),
axis.title.x = element_text(size = 16), axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14), axis.title.y = element_text(size = 16))
# ca fire size
ggplot(data = fires_new_ca, aes(x = FIRE_SIZE)) + geom_histogram(bins = 100) + scale_x_log10(breaks = trans_breaks("log10",
function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) + scale_y_continuous(labels = fancy_scientific) +
ggtitle("CA Fire Size in Acres") + ylab("FREQUENCY") + theme(title = element_text(size = 20),
axis.title.x = element_text(size = 16), axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14), axis.title.y = element_text(size = 16))
# west fire size
ggplot(data = fires_new_west, aes(x = FIRE_SIZE)) + geom_histogram(bins = 100) +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10",
math_format(10^.x))) + scale_y_continuous(labels = fancy_scientific) + ggtitle("West Fire Size in Acres") +
ylab("FREQUENCY") + theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16))
# east fire size
ggplot(data = fires_new_east, aes(x = FIRE_SIZE)) + geom_histogram(bins = 100) +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10",
math_format(10^.x))) + scale_y_continuous(labels = fancy_scientific) + ggtitle("East Fire Size in Acres") +
ylab("FREQUENCY") + theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16))
# us days to containment
ggplot(data = fires_new, aes(x = DAYS_TO_CONT)) + geom_histogram(bins = 10) + scale_x_log10(breaks = trans_breaks("log10",
function(x) 10^x), labels = trans_format("log10", math_format(10^.x))) + scale_y_continuous(labels = fancy_scientific) +
ggtitle("US Days To Containment") + ylab("FREQUENCY") + theme(title = element_text(size = 20),
axis.title.x = element_text(size = 16), axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14), axis.title.y = element_text(size = 16))
# ca days to containment
ggplot(data = fires_new_ca, aes(x = DAYS_TO_CONT)) + geom_histogram(bins = 10) +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10",
math_format(10^.x))) + scale_y_continuous(labels = fancy_scientific) + ggtitle("CA Days To Containment") +
ylab("FREQUENCY") + theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16))
# west days to containment
ggplot(data = fires_new_west, aes(x = DAYS_TO_CONT)) + geom_histogram(bins = 10) +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10",
math_format(10^.x))) + scale_y_continuous(labels = fancy_scientific) + ggtitle("West Days To Containment") +
ylab("FREQUENCY") + theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16))
# East days to containment
ggplot(data = fires_new_east, aes(x = DAYS_TO_CONT)) + geom_histogram(bins = 10) +
scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), labels = trans_format("log10",
math_format(10^.x))) + scale_y_continuous(labels = fancy_scientific) + ggtitle("East Days To Containment") +
ylab("FREQUENCY") + theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16))
# Boxplots Fire Size, Days to Containment and SPEI, US and CA
# x labels with the number of obs for each group US
fire_year_new_xlab <- paste(levels(fires_new$FIRE_YEAR), "\n(N=", table(fires_new$FIRE_YEAR),
")", sep = "")
# CA
fire_year_new_ca_xlab <- paste(levels(fires_new_ca$FIRE_YEAR), "\n(N=", table(fires_new_ca$FIRE_YEAR),
")", sep = "")
# US fireszie
box_violin_new_fs <- ggplot(data = fires_new, aes(x = FIRE_YEAR, y = FIRE_SIZE)) +
stat_boxplot(geom = "errorbar", width = 0.3) + geom_boxplot(varwidth = TRUE,
alpha = 1, outlier.shape = 1, outlier.alpha = 0.5) + geom_violin(width = 1.4,
alpha = 0.1) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) + stat_summary(fun = mean,
geom = "point", shape = 23, size = 3) + scale_x_discrete(labels = fire_year_new_xlab) +
ggtitle("US Fire Size in Acres by Year") + theme(legend.position = "none", plot.title = element_text(size = 22))
box_violin_new_fs
# CA firesize
box_violin_new_ca_fs <- ggplot(data = fires_new_ca, aes(x = FIRE_YEAR, y = FIRE_SIZE)) +
stat_boxplot(geom = "errorbar", width = 0.3) + geom_boxplot(varwidth = TRUE,
alpha = 1, outlier.shape = 1, outlier.alpha = 0.5) + geom_violin(width = 1.4,
alpha = 0.1) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) + stat_summary(fun = mean,
geom = "point", shape = 23, size = 3) + scale_x_discrete(labels = fire_year_new_ca_xlab) +
ggtitle("CA Fire Size in Acres by Year") + theme(legend.position = "none", plot.title = element_text(size = 22))
box_violin_new_ca_fs
# US days to cont
box_violin_new_dtc <- ggplot(data = fires_new, aes(x = FIRE_YEAR, y = DAYS_TO_CONT)) +
stat_boxplot(geom = "errorbar", width = 0.3) + geom_boxplot(varwidth = TRUE,
alpha = 1, outlier.shape = 1, outlier.alpha = 0.5) + geom_violin(width = 1.4,
alpha = 0.1) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) + stat_summary(fun = mean,
geom = "point", shape = 23, size = 3) + scale_x_discrete(labels = fire_year_new_xlab) +
ggtitle("US Days to Containment by Year") + theme(legend.position = "none", plot.title = element_text(size = 22))
box_violin_new_dtc
# CA days to cont
box_violin_new_ca_dtc <- ggplot(data = fires_new_ca, aes(x = FIRE_YEAR, y = DAYS_TO_CONT)) +
stat_boxplot(geom = "errorbar", width = 0.3) + geom_boxplot(varwidth = TRUE,
alpha = 1, outlier.shape = 1, outlier.alpha = 0.5) + geom_violin(width = 1.4,
alpha = 0.1) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) + stat_summary(fun = mean,
geom = "point", shape = 23, size = 3) + scale_x_discrete(labels = fire_year_new_ca_xlab) +
ggtitle("CA Days to Containment by Year") + theme(legend.position = "none", plot.title = element_text(size = 22))
box_violin_new_ca_dtc
# East vs West
# x labels with the number of obs for each group West
fire_year_west_xlab <- paste(levels(fires_new_west$FIRE_YEAR), "\n(N=", table(fires_new_west$FIRE_YEAR),
")", sep = "")
# East
fire_year_east_xlab <- paste(levels(fires_new_east$FIRE_YEAR), "\n(N=", table(fires_new_east$FIRE_YEAR),
")", sep = "")
# West fireszie
box_violin_west_fs <- ggplot(data = fires_new_west, aes(x = FIRE_YEAR, y = FIRE_SIZE)) +
stat_boxplot(geom = "errorbar", width = 0.3) + geom_boxplot(varwidth = TRUE,
alpha = 1, outlier.shape = 1, outlier.alpha = 0.5) + geom_violin(width = 1.4,
alpha = 0.1) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) + stat_summary(fun = mean,
geom = "point", shape = 23, size = 3) + scale_x_discrete(labels = fire_year_west_xlab) +
ggtitle("West Fire Size in Acres by Year") + theme(legend.position = "none",
plot.title = element_text(size = 22))
box_violin_west_fs
# East firesize
box_violin_east_fs <- ggplot(data = fires_new_east, aes(x = FIRE_YEAR, y = FIRE_SIZE)) +
stat_boxplot(geom = "errorbar", width = 0.3) + geom_boxplot(varwidth = TRUE,
alpha = 1, outlier.shape = 1, outlier.alpha = 0.5) + geom_violin(width = 1.4,
alpha = 0.1) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) + stat_summary(fun = mean,
geom = "point", shape = 23, size = 3) + scale_x_discrete(labels = fire_year_east_xlab) +
ggtitle("East Fire Size in Acres by Year") + theme(legend.position = "none",
plot.title = element_text(size = 22))
box_violin_east_fs
# West days to cont
box_violin_west_dtc <- ggplot(data = fires_new_west, aes(x = FIRE_YEAR, y = DAYS_TO_CONT)) +
stat_boxplot(geom = "errorbar", width = 0.3) + geom_boxplot(varwidth = TRUE,
alpha = 1, outlier.shape = 1, outlier.alpha = 0.5) + geom_violin(width = 1.4,
alpha = 0.1) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) + stat_summary(fun = mean,
geom = "point", shape = 23, size = 3) + scale_x_discrete(labels = fire_year_west_xlab) +
ggtitle("West Days to Containment by Year") + theme(legend.position = "none",
plot.title = element_text(size = 22))
box_violin_west_dtc
# East days to cont
box_violin_east_dtc <- ggplot(data = fires_new_east, aes(x = FIRE_YEAR, y = DAYS_TO_CONT)) +
stat_boxplot(geom = "errorbar", width = 0.3) + geom_boxplot(varwidth = TRUE,
alpha = 1, outlier.shape = 1, outlier.alpha = 0.5) + geom_violin(width = 1.4,
alpha = 0.1) + scale_y_log10(breaks = trans_breaks("log10", function(x) 10^x),
labels = trans_format("log10", math_format(10^.x))) + stat_summary(fun = mean,
geom = "point", shape = 23, size = 3) + scale_x_discrete(labels = fire_year_east_xlab) +
ggtitle("East Days to Containment by Year") + theme(legend.position = "none",
plot.title = element_text(size = 22))
box_violin_east_dtc
# x labels with the number of obs for each group SPEI
location_spei_xlab <- paste(levels(combined_SPEI_group$Location), "\n(N=", table(combined_SPEI_group$Location),
")", sep = "")
# US & CA SPEI
box_SPEI_group <- ggplot(combined_SPEI_group, aes(Location, SPEI)) + stat_boxplot(geom = "errorbar",
width = 0.2) + geom_boxplot(notch = TRUE, varwidth = TRUE, outlier.shape = NA) +
stat_summary(fun = mean, geom = "point", shape = 23, size = 3) + scale_x_discrete(labels = location_spei_xlab) +
geom_jitter(width = 0.2) + ggtitle("SPEI by Location") + theme(legend.position = "none",
plot.title = element_text(size = 22))
box_SPEI_group
## Visualizations
glimpse(fires_new)
## Rows: 2,120,440
## Columns: 18
## $ FOD_ID <chr> "1", "2", "3", "4", "5", "6", "7", "8", "9",…
## $ FIRE_YEAR <fct> 2005, 2004, 2004, 2004, 2004, 2004, 2004, 20…
## $ DISCOVERY_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-06…
## $ DISCOVERY_DOY <fct> 33, 133, 152, 180, 180, 182, 183, 67, 74, 18…
## $ DISCOVERY_TIME <times> 13:00:00, 08:45:00, 19:21:00, 16:00:00, 16…
## $ NWCG_CAUSE_CLASSIFICATION <fct> Human, Natural, Human, Natural, Natural, Nat…
## $ NWCG_GENERAL_CAUSE <fct> Power generation/transmission/distribution, …
## $ CONT_DATE <date> 2005-02-02, 2004-05-12, 2004-05-31, 2004-07…
## $ CONT_DOY <fct> 33, 133, 152, 185, 185, 183, 184, 67, 74, 18…
## $ CONT_TIME <times> 17:30:00, 15:30:00, 20:24:00, 14:00:00, 12…
## $ FIRE_SIZE <dbl> 0.10, 0.25, 0.10, 0.10, 0.10, 0.10, 0.10, 0.…
## $ FIRE_SIZE_CLASS <fct> A, A, A, A, A, A, A, B, B, A, A, A, A, B, A,…
## $ LATITUDE <dbl> 40, 39, 39, 39, 39, 39, 39, 41, 41, 39, 39, …
## $ LONGITUDE <dbl> -121, -120, -121, -120, -120, -120, -120, -1…
## $ STATE <fct> CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, CA, …
## $ COUNTY <fct> Plumas County, Placer County, El Dorado Coun…
## $ REGION <fct> West, West, West, West, West, West, West, We…
## $ DAYS_TO_CONT <dbl> 0, 0, 0, 5, 5, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0,…
# Fires over time
fires_new %>%
group_by(FIRE_YEAR) %>%
summarize(n_fires = n()) %>%
ggplot(aes(x = FIRE_YEAR, y = n_fires/1000)) + geom_bar(stat = "identity", fill = "grey") +
labs(x = "FIRE_YEAR", y = "Number of wildfires (thousands)", title = "US Wildfires by Year") +
theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16))
fires_new_ca %>%
group_by(FIRE_YEAR) %>%
summarize(n_fires = n()) %>%
ggplot(aes(x = FIRE_YEAR, y = n_fires/1000)) + geom_bar(stat = "identity", fill = "grey") +
labs(x = "FIRE_YEAR", y = "Number of wildfires (thousands)", title = "CA Wildfires by Year") +
theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16))
fires_new_west %>%
group_by(FIRE_YEAR) %>%
summarize(n_fires = n()) %>%
ggplot(aes(x = FIRE_YEAR, y = n_fires/1000)) + geom_bar(stat = "identity", fill = "grey") +
labs(x = "FIRE_YEAR", y = "Number of wildfires (thousands)", title = "Western States Wildfires by Year") +
theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16))
fires_new_east %>%
group_by(FIRE_YEAR) %>%
summarize(n_fires = n()) %>%
ggplot(aes(x = FIRE_YEAR, y = n_fires/1000)) + geom_bar(stat = "identity", fill = "grey") +
labs(x = "FIRE_YEAR", y = "Number of wildfires (thousands)", title = "Eastern States Wildfires by Year") +
theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14),
axis.title.y = element_text(size = 16))
# Fires Causes
# US Fires by cause
fires_new %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(n_fires = n()/1000) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, n_fires), y = n_fires)) + geom_bar(stat = "identity",
fill = "grey") + coord_flip() + labs(x = "", y = "Number of fires (thousands)",
title = "US Fires by Cause") + theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
# CA Fires by cause
fires_new_ca %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(n_fires = n()/1000) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, n_fires), y = n_fires)) + geom_bar(stat = "identity",
fill = "grey") + coord_flip() + labs(x = "", y = "Number of fires (thousands)",
title = "CA Fires by Cause") + theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
# West Fires by cause
fires_new_west %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(n_fires = n()/1000) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, n_fires), y = n_fires)) + geom_bar(stat = "identity",
fill = "grey") + coord_flip() + labs(x = "", y = "Number of fires (thousands)",
title = "West Fires by Cause") + theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
# East Fires by cause
fires_new_east %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(n_fires = n()/1000) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, n_fires), y = n_fires)) + geom_bar(stat = "identity",
fill = "grey") + coord_flip() + labs(x = "", y = "Number of fires (thousands)",
title = "East Fires by Cause") + theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
# US Fire Size by cause
fires_new %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(mean_size = mean(FIRE_SIZE, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, mean_size), y = mean_size)) + geom_bar(stat = "identity",
fill = "grey") + coord_flip() + labs(x = "", y = "Acres", title = "US Avg Fire Size by Cause") +
theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
# CA Fire Size by cause
fires_new_ca %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(mean_size = mean(FIRE_SIZE, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, mean_size), y = mean_size)) + geom_bar(stat = "identity",
fill = "grey") + coord_flip() + labs(x = "", y = "Acres", title = "CA Avg Fire Size by Cause") +
theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
# West Fire Size by cause
fires_new_west %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(mean_size = mean(FIRE_SIZE, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, mean_size), y = mean_size)) + geom_bar(stat = "identity",
fill = "grey") + coord_flip() + labs(x = "", y = "Acres", title = "West Avg Fire Size by Cause") +
theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
# East Fire Size by cause
fires_new_east %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(mean_size = mean(FIRE_SIZE, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, mean_size), y = mean_size)) + geom_bar(stat = "identity",
fill = "grey") + coord_flip() + labs(x = "", y = "Acres", title = "East Avg Fire Size by Cause") +
theme(title = element_text(size = 20), axis.title.x = element_text(size = 16),
axis.text.x = element_text(size = 14), axis.text.y = element_text(size = 14))
# US Days to cont by cause
fires_new %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(mean_days_to_cont = mean(DAYS_TO_CONT, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, mean_days_to_cont), y = mean_days_to_cont)) +
geom_bar(stat = "identity", fill = "grey") + coord_flip() + labs(x = "", y = "Days",
title = "US Avg Days to Containment by Cause") + theme(title = element_text(size = 20),
axis.title.x = element_text(size = 16), axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14))
# CA Days to cont by cause
fires_new_ca %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(mean_days_to_cont = mean(DAYS_TO_CONT, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, mean_days_to_cont), y = mean_days_to_cont)) +
geom_bar(stat = "identity", fill = "grey") + coord_flip() + labs(x = "", y = "Days",
title = "CA Avg Days to Containment by Cause") + theme(title = element_text(size = 20),
axis.title.x = element_text(size = 16), axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14))
# West Days to cont by cause
fires_new_west %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(mean_days_to_cont = mean(DAYS_TO_CONT, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, mean_days_to_cont), y = mean_days_to_cont)) +
geom_bar(stat = "identity", fill = "grey") + coord_flip() + labs(x = "", y = "Days",
title = "West Avg Days to Containment by Cause") + theme(title = element_text(size = 20),
axis.title.x = element_text(size = 16), axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14))
# East Days to cont by cause
fires_new_east %>%
group_by(NWCG_GENERAL_CAUSE) %>%
summarize(mean_days_to_cont = mean(DAYS_TO_CONT, na.rm = TRUE)) %>%
ggplot(aes(x = reorder(NWCG_GENERAL_CAUSE, mean_days_to_cont), y = mean_days_to_cont)) +
geom_bar(stat = "identity", fill = "grey") + coord_flip() + labs(x = "", y = "Days",
title = "East Avg Days to Containment by Cause") + theme(title = element_text(size = 20),
axis.title.x = element_text(size = 16), axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14))
## Correlation Matrix Numeric Attributes
# US fires
data(fires_new, package = "mosaicData")
df <- dplyr::select_if(fires_new, is.numeric)
r <- cor(df, use = "complete.obs")
round(r, 2)
## DISCOVERY_TIME CONT_TIME FIRE_SIZE LATITUDE LONGITUDE
## DISCOVERY_TIME 1.00 0.59 0.00 0.00 0.06
## CONT_TIME 0.59 1.00 0.00 -0.03 0.03
## FIRE_SIZE 0.00 0.00 1.00 0.01 -0.03
## LATITUDE 0.00 -0.03 0.01 1.00 -0.29
## LONGITUDE 0.06 0.03 -0.03 -0.29 1.00
## DAYS_TO_CONT 0.00 -0.07 0.16 0.10 -0.11
## DAYS_TO_CONT
## DISCOVERY_TIME 0.00
## CONT_TIME -0.07
## FIRE_SIZE 0.16
## LATITUDE 0.10
## LONGITUDE -0.11
## DAYS_TO_CONT 1.00
ggcorrplot(r, hc.order = TRUE, type = "lower", lab = TRUE)
# CA fires
data(fires_new_ca, package = "mosaicData")
df <- dplyr::select_if(fires_new_ca, is.numeric)
r <- cor(df, use = "complete.obs")
round(r, 2)
## DISCOVERY_TIME CONT_TIME FIRE_SIZE LATITUDE LONGITUDE
## DISCOVERY_TIME 1.00 0.65 0.00 0.00 0.00
## CONT_TIME 0.65 1.00 0.01 0.02 0.01
## FIRE_SIZE 0.00 0.01 1.00 0.00 0.00
## LATITUDE 0.00 0.02 0.00 1.00 -0.88
## LONGITUDE 0.00 0.01 0.00 -0.88 1.00
## DAYS_TO_CONT 0.00 -0.02 0.22 0.05 -0.02
## DAYS_TO_CONT
## DISCOVERY_TIME 0.00
## CONT_TIME -0.02
## FIRE_SIZE 0.22
## LATITUDE 0.05
## LONGITUDE -0.02
## DAYS_TO_CONT 1.00
ggcorrplot(r, hc.order = TRUE, type = "lower", lab = TRUE)
# West fires
data(fires_new_west, package = "mosaicData")
df <- dplyr::select_if(fires_new_west, is.numeric)
r <- cor(df, use = "complete.obs")
round(r, 2)
## DISCOVERY_TIME CONT_TIME FIRE_SIZE LATITUDE LONGITUDE
## DISCOVERY_TIME 1.00 0.53 0.00 0.00 0.07
## CONT_TIME 0.53 1.00 0.01 -0.02 0.07
## FIRE_SIZE 0.00 0.01 1.00 0.01 -0.01
## LATITUDE 0.00 -0.02 0.01 1.00 -0.25
## LONGITUDE 0.07 0.07 -0.01 -0.25 1.00
## DAYS_TO_CONT 0.01 -0.07 0.16 0.11 -0.08
## DAYS_TO_CONT
## DISCOVERY_TIME 0.01
## CONT_TIME -0.07
## FIRE_SIZE 0.16
## LATITUDE 0.11
## LONGITUDE -0.08
## DAYS_TO_CONT 1.00
ggcorrplot(r, hc.order = TRUE, type = "lower", lab = TRUE)
# East fires
data(fires_new_east, package = "mosaicData")
df <- dplyr::select_if(fires_new_east, is.numeric)
r <- cor(df, use = "complete.obs")
round(r, 2)
## DISCOVERY_TIME CONT_TIME FIRE_SIZE LATITUDE LONGITUDE
## DISCOVERY_TIME 1.00 0.70 0.00 0.03 0.00
## CONT_TIME 0.70 1.00 0.00 -0.04 -0.09
## FIRE_SIZE 0.00 0.00 1.00 -0.02 0.00
## LATITUDE 0.03 -0.04 -0.02 1.00 0.16
## LONGITUDE 0.00 -0.09 0.00 0.16 1.00
## DAYS_TO_CONT 0.01 -0.09 0.14 0.02 0.06
## DAYS_TO_CONT
## DISCOVERY_TIME 0.01
## CONT_TIME -0.09
## FIRE_SIZE 0.14
## LATITUDE 0.02
## LONGITUDE 0.06
## DAYS_TO_CONT 1.00
ggcorrplot(r, hc.order = TRUE, type = "lower", lab = TRUE)
## Fire size and SPEI Linear Regression
# CA FIRE_SIZE & SPEI new df with fire_year and mean fire size for CA
fires_new_ca_fs <- fires_new_ca %>%
group_by(FIRE_YEAR) %>%
summarise_at(vars(FIRE_SIZE), list(AVG_FIRE_SIZE_CA = mean))
fires_new_ca_fs
## # A tibble: 27 × 2
## FIRE_YEAR AVG_FIRE_SIZE_CA
## <fct> <dbl>
## 1 1992 27.4
## 2 1993 38.9
## 3 1994 47.1
## 4 1995 29.3
## 5 1996 77.1
## 6 1997 40.9
## 7 1998 23.3
## 8 1999 90.3
## 9 2000 38.0
## 10 2001 42.9
## # … with 17 more rows
# rename fire_year for merging
fires_new_ca_fs <- rename_all(fires_new_ca_fs, recode, FIRE_YEAR = "Year")
fires_new_ca_fs
## # A tibble: 27 × 2
## Year AVG_FIRE_SIZE_CA
## <fct> <dbl>
## 1 1992 27.4
## 2 1993 38.9
## 3 1994 47.1
## 4 1995 29.3
## 5 1996 77.1
## 6 1997 40.9
## 7 1998 23.3
## 8 1999 90.3
## 9 2000 38.0
## 10 2001 42.9
## # … with 17 more rows
# merge with Ca-spei
ca_fs_spei <- merge(fires_new_ca_fs, ca_SPEI, by = "Year", all = TRUE)
ca_fs_spei
## Year AVG_FIRE_SIZE_CA CA_5yr_SPEI
## 1 1992 27 -1.19
## 2 1993 39 -0.52
## 3 1994 47 -0.81
## 4 1995 29 0.28
## 5 1996 77 0.36
## 6 1997 41 0.60
## 7 1998 23 0.96
## 8 1999 90 1.25
## 9 2000 38 0.67
## 10 2001 43 0.33
## 11 2002 72 -0.05
## 12 2003 131 -0.98
## 13 2004 41 -1.04
## 14 2005 26 -0.73
## 15 2006 68 -0.16
## 16 2007 79 -0.26
## 17 2008 135 -0.53
## 18 2009 47 -0.74
## 19 2010 15 -0.80
## 20 2011 22 -0.60
## 21 2012 106 -0.56
## 22 2013 67 -0.61
## 23 2014 84 -1.03
## 24 2015 115 -1.65
## 25 2016 73 -1.93
## 26 2017 143 -1.38
## 27 2018 172 -1.45
## 28 2019 NA -0.61
## 29 2020 NA -0.61
## 30 2021 NA -1.10
# linear regression
fit1 <- lm(AVG_FIRE_SIZE_CA ~ CA_5yr_SPEI, data = ca_fs_spei)
summary(fit1)
##
## Call:
## lm(formula = AVG_FIRE_SIZE_CA ~ CA_5yr_SPEI, data = ca_fs_spei)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.49 -27.31 -7.73 24.24 85.33
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59.91 8.97 6.68 0.00000053 ***
## CA_5yr_SPEI -18.74 9.79 -1.91 0.067 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 40 on 25 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.128, Adjusted R-squared: 0.0928
## F-statistic: 3.66 on 1 and 25 DF, p-value: 0.0672
# Linear regression func for values to show on visual
ggplotRegression <- function(fit) {
require(ggplot2)
ggplot(fit$model, aes_string(x = names(fit$model)[2], y = names(fit$model)[1])) +
geom_point() + stat_smooth(method = "lm", col = "red") + labs(title = paste("Adj R2 = ",
signif(summary(fit)$adj.r.squared, 5), "Intercept =", signif(fit$coef[[1]],
5), " Slope =", signif(fit$coef[[2]], 5), " P =", signif(summary(fit)$coef[2,
4], 5)))
}
ggplotRegression(fit1)
# normalize FIRE_SIZE_CA
ca_fs_spei_norm <- ca_fs_spei %>%
mutate_at(c("AVG_FIRE_SIZE_CA"), ~(scale(.) %>%
as.vector))
ca_fs_spei_norm
## Year AVG_FIRE_SIZE_CA CA_5yr_SPEI
## 1 1992 -0.979 -1.19
## 2 1993 -0.706 -0.52
## 3 1994 -0.512 -0.81
## 4 1995 -0.934 0.28
## 5 1996 0.201 0.36
## 6 1997 -0.657 0.60
## 7 1998 -1.075 0.96
## 8 1999 0.513 1.25
## 9 2000 -0.727 0.67
## 10 2001 -0.611 0.33
## 11 2002 0.091 -0.05
## 12 2003 1.478 -0.98
## 13 2004 -0.665 -1.04
## 14 2005 -1.016 -0.73
## 15 2006 -0.014 -0.16
## 16 2007 0.247 -0.26
## 17 2008 1.583 -0.53
## 18 2009 -0.509 -0.74
## 19 2010 -1.263 -0.80
## 20 2011 -1.096 -0.60
## 21 2012 0.893 -0.56
## 22 2013 -0.047 -0.61
## 23 2014 0.374 -1.03
## 24 2015 1.109 -1.65
## 25 2016 0.102 -1.93
## 26 2017 1.756 -1.38
## 27 2018 2.463 -1.45
## 28 2019 NA -0.61
## 29 2020 NA -0.61
## 30 2021 NA -1.10
fit2 <- lm(AVG_FIRE_SIZE_CA ~ CA_5yr_SPEI, data = ca_fs_spei_norm)
summary(fit2)
##
## Call:
## lm(formula = AVG_FIRE_SIZE_CA ~ CA_5yr_SPEI, data = ca_fs_spei_norm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.412 -0.648 -0.183 0.575 2.025
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.207 0.213 -0.97 0.340
## CA_5yr_SPEI -0.445 0.232 -1.91 0.067 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.95 on 25 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.128, Adjusted R-squared: 0.0928
## F-statistic: 3.66 on 1 and 25 DF, p-value: 0.0672
ggplotRegression(fit2)
# normalization gives same result
# CA FIRE_FREQ & SPEI new df with fire_year and fire frequency for CA
fires_new_ca_ff <- fires_new_ca %>%
group_by(FIRE_YEAR) %>%
summarise(CA_FIRE_FREQ = (count = n()))
fires_new_ca_ff
## # A tibble: 27 × 2
## FIRE_YEAR CA_FIRE_FREQ
## <fct> <int>
## 1 1992 10831
## 2 1993 8268
## 3 1994 8649
## 4 1995 7381
## 5 1996 9170
## 6 1997 7928
## 7 1998 6861
## 8 1999 8909
## 9 2000 6970
## 10 2001 8163
## # … with 17 more rows
# rename fire_year for merging
fires_new_ca_ff <- rename_all(fires_new_ca_ff, recode, FIRE_YEAR = "Year")
fires_new_ca_ff
## # A tibble: 27 × 2
## Year CA_FIRE_FREQ
## <fct> <int>
## 1 1992 10831
## 2 1993 8268
## 3 1994 8649
## 4 1995 7381
## 5 1996 9170
## 6 1997 7928
## 7 1998 6861
## 8 1999 8909
## 9 2000 6970
## 10 2001 8163
## # … with 17 more rows
# merge with Ca-spei
ca_ff_spei <- merge(fires_new_ca_ff, ca_SPEI, by = "Year", all = TRUE)
ca_ff_spei
## Year CA_FIRE_FREQ CA_5yr_SPEI
## 1 1992 10831 -1.19
## 2 1993 8268 -0.52
## 3 1994 8649 -0.81
## 4 1995 7381 0.28
## 5 1996 9170 0.36
## 6 1997 7928 0.60
## 7 1998 6861 0.96
## 8 1999 8909 1.25
## 9 2000 6970 0.67
## 10 2001 8163 0.33
## 11 2002 7118 -0.05
## 12 2003 7894 -0.98
## 13 2004 7407 -1.04
## 14 2005 10219 -0.73
## 15 2006 12038 -0.16
## 16 2007 13428 -0.26
## 17 2008 10594 -0.53
## 18 2009 9862 -0.74
## 19 2010 8304 -0.80
## 20 2011 8560 -0.60
## 21 2012 7198 -0.56
## 22 2013 8722 -0.61
## 23 2014 6493 -1.03
## 24 2015 7355 -1.65
## 25 2016 7882 -1.93
## 26 2017 9537 -1.38
## 27 2018 9488 -1.45
## 28 2019 NA -0.61
## 29 2020 NA -0.61
## 30 2021 NA -1.10
# linear regression
fit3 <- lm(CA_FIRE_FREQ ~ CA_5yr_SPEI, data = ca_ff_spei)
summary(fit3)
##
## Call:
## lm(formula = CA_FIRE_FREQ ~ CA_5yr_SPEI, data = ca_ff_spei)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2325 -1302 -400 633 4754
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8625 374 23.05 <0.0000000000000002 ***
## CA_5yr_SPEI -187 408 -0.46 0.65
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1670 on 25 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.00831, Adjusted R-squared: -0.0314
## F-statistic: 0.209 on 1 and 25 DF, p-value: 0.651
ggplotRegression(fit3)
## Fire area and SPEI Linear Regression
# CA FIRE_AREA & SPEI new df with fire_year and fire area for CA
fires_new_ca_fa <- fires_new_ca %>%
group_by(FIRE_YEAR) %>%
summarise_at(vars(FIRE_SIZE), list(AREA_BURNED_CA = sum))
fires_new_ca_fa
## # A tibble: 27 × 2
## FIRE_YEAR AREA_BURNED_CA
## <fct> <dbl>
## 1 1992 296440.
## 2 1993 321495.
## 3 1994 406965.
## 4 1995 216069.
## 5 1996 707109.
## 6 1997 324566.
## 7 1998 160009.
## 8 1999 804123.
## 9 2000 264671.
## 10 2001 350140.
## # … with 17 more rows
# rename fire_year for merging
fires_new_ca_fa <- rename_all(fires_new_ca_fa, recode, FIRE_YEAR = "Year")
fires_new_ca_fa
## # A tibble: 27 × 2
## Year AREA_BURNED_CA
## <fct> <dbl>
## 1 1992 296440.
## 2 1993 321495.
## 3 1994 406965.
## 4 1995 216069.
## 5 1996 707109.
## 6 1997 324566.
## 7 1998 160009.
## 8 1999 804123.
## 9 2000 264671.
## 10 2001 350140.
## # … with 17 more rows
# merge with Ca-spei
ca_fa_spei <- merge(fires_new_ca_fa, ca_SPEI, by = "Year", all = TRUE)
ca_fa_spei
## Year AREA_BURNED_CA CA_5yr_SPEI
## 1 1992 296440 -1.19
## 2 1993 321495 -0.52
## 3 1994 406965 -0.81
## 4 1995 216069 0.28
## 5 1996 707109 0.36
## 6 1997 324566 0.60
## 7 1998 160009 0.96
## 8 1999 804123 1.25
## 9 2000 264671 0.67
## 10 2001 350140 0.33
## 11 2002 515860 -0.05
## 12 2003 1033343 -0.98
## 13 2004 300885 -1.04
## 14 2005 263962 -0.73
## 15 2006 819106 -0.16
## 16 2007 1061554 -0.26
## 17 2008 1433971 -0.53
## 18 2009 465219 -0.74
## 19 2010 127905 -0.80
## 20 2011 192278 -0.60
## 21 2012 764759 -0.56
## 22 2013 581410 -0.61
## 23 2014 547829 -1.03
## 24 2015 848438 -1.65
## 25 2016 574863 -1.93
## 26 2017 1360160 -1.38
## 27 2018 1635758 -1.45
## 28 2019 NA -0.61
## 29 2020 NA -0.61
## 30 2021 NA -1.10
# linear regression
fit4 <- lm(AREA_BURNED_CA ~ CA_5yr_SPEI, data = ca_fa_spei)
summary(fit4)
##
## Call:
## lm(formula = AREA_BURNED_CA ~ CA_5yr_SPEI, data = ca_fa_spei)
##
## Residuals:
## Min 1Q Median 3Q Max
## -531303 -267673 -131405 245401 874354
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 533429 88710 6.01 0.0000028 ***
## CA_5yr_SPEI -157224 96850 -1.62 0.12
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 397000 on 25 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.0954, Adjusted R-squared: 0.0592
## F-statistic: 2.64 on 1 and 25 DF, p-value: 0.117
ggplotRegression(fit4)
## Large Fires and SPEI Linear Regression
# Number of fires greater than 10,000 acres and the proportion Large fire
# defined as > 10,000 in literature here:
# (https://fireecology.springeropen.com/articles/10.1186/s42408-021-00110-7)
fires_new_ca_lf <- fires_new_ca %>%
group_by(FIRE_YEAR) %>%
summarize(CA_NUM_FIRES = n(), CA_NUM_LARGE_FIRES = sum(FIRE_SIZE > 10000), CA_PCT_LARGE_FIRES = (CA_NUM_LARGE_FIRES/CA_NUM_FIRES) *
100)
fires_new_ca_lf
## # A tibble: 27 × 4
## FIRE_YEAR CA_NUM_FIRES CA_NUM_LARGE_FIRES CA_PCT_LARGE_FIRES
## <fct> <int> <int> <dbl>
## 1 1992 10831 3 0.0277
## 2 1993 8268 6 0.0726
## 3 1994 8649 6 0.0694
## 4 1995 7381 2 0.0271
## 5 1996 9170 14 0.153
## 6 1997 7928 6 0.0757
## 7 1998 6861 2 0.0292
## 8 1999 8909 18 0.202
## 9 2000 6970 3 0.0430
## 10 2001 8163 8 0.0980
## # … with 17 more rows
# rename fire_year for merging
fires_new_ca_lf <- rename_all(fires_new_ca_lf, recode, FIRE_YEAR = "Year")
fires_new_ca_lf
## # A tibble: 27 × 4
## Year CA_NUM_FIRES CA_NUM_LARGE_FIRES CA_PCT_LARGE_FIRES
## <fct> <int> <int> <dbl>
## 1 1992 10831 3 0.0277
## 2 1993 8268 6 0.0726
## 3 1994 8649 6 0.0694
## 4 1995 7381 2 0.0271
## 5 1996 9170 14 0.153
## 6 1997 7928 6 0.0757
## 7 1998 6861 2 0.0292
## 8 1999 8909 18 0.202
## 9 2000 6970 3 0.0430
## 10 2001 8163 8 0.0980
## # … with 17 more rows
# merge with Ca-spei
ca_lf_spei <- merge(fires_new_ca_lf, ca_SPEI, by = "Year", all = TRUE)
ca_lf_spei
## Year CA_NUM_FIRES CA_NUM_LARGE_FIRES CA_PCT_LARGE_FIRES CA_5yr_SPEI
## 1 1992 10831 3 0.028 -1.19
## 2 1993 8268 6 0.073 -0.52
## 3 1994 8649 6 0.069 -0.81
## 4 1995 7381 2 0.027 0.28
## 5 1996 9170 14 0.153 0.36
## 6 1997 7928 6 0.076 0.60
## 7 1998 6861 2 0.029 0.96
## 8 1999 8909 18 0.202 1.25
## 9 2000 6970 3 0.043 0.67
## 10 2001 8163 8 0.098 0.33
## 11 2002 7118 7 0.098 -0.05
## 12 2003 7894 14 0.177 -0.98
## 13 2004 7407 8 0.108 -1.04
## 14 2005 10219 3 0.029 -0.73
## 15 2006 12038 16 0.133 -0.16
## 16 2007 13428 14 0.104 -0.26
## 17 2008 10594 38 0.359 -0.53
## 18 2009 9862 4 0.041 -0.74
## 19 2010 8304 2 0.024 -0.80
## 20 2011 8560 4 0.047 -0.60
## 21 2012 7198 12 0.167 -0.56
## 22 2013 8722 11 0.126 -0.61
## 23 2014 6493 12 0.185 -1.03
## 24 2015 7355 17 0.231 -1.65
## 25 2016 7882 10 0.127 -1.93
## 26 2017 9537 25 0.262 -1.38
## 27 2018 9488 21 0.221 -1.45
## 28 2019 NA NA NA -0.61
## 29 2020 NA NA NA -0.61
## 30 2021 NA NA NA -1.10
# linear regression
fit5 <- lm(CA_PCT_LARGE_FIRES ~ CA_5yr_SPEI, data = ca_lf_spei)
summary(fit5)
##
## Call:
## lm(formula = CA_PCT_LARGE_FIRES ~ CA_5yr_SPEI, data = ca_lf_spei)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11237 -0.05555 -0.00996 0.04670 0.23702
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.1069 0.0186 5.75 0.0000054 ***
## CA_5yr_SPEI -0.0279 0.0203 -1.37 0.18
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.083 on 25 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.0701, Adjusted R-squared: 0.0329
## F-statistic: 1.89 on 1 and 25 DF, p-value: 0.182
ggplotRegression(fit5)
## Multiple Linear Regression
# merge fa,fs,lf
ca_fa_lf_SPEI <- merge(fires_new_ca_fa, ca_lf_spei, by = "Year", all = TRUE)
ca_combined_SPEI <- merge(fires_new_ca_fs, ca_fa_lf_SPEI, by = "Year", all = TRUE)
ca_combined_SPEI
## Year AVG_FIRE_SIZE_CA AREA_BURNED_CA CA_NUM_FIRES CA_NUM_LARGE_FIRES
## 1 1992 27 296440 10831 3
## 2 1993 39 321495 8268 6
## 3 1994 47 406965 8649 6
## 4 1995 29 216069 7381 2
## 5 1996 77 707109 9170 14
## 6 1997 41 324566 7928 6
## 7 1998 23 160009 6861 2
## 8 1999 90 804123 8909 18
## 9 2000 38 264671 6970 3
## 10 2001 43 350140 8163 8
## 11 2002 72 515860 7118 7
## 12 2003 131 1033343 7894 14
## 13 2004 41 300885 7407 8
## 14 2005 26 263962 10219 3
## 15 2006 68 819106 12038 16
## 16 2007 79 1061554 13428 14
## 17 2008 135 1433971 10594 38
## 18 2009 47 465219 9862 4
## 19 2010 15 127905 8304 2
## 20 2011 22 192278 8560 4
## 21 2012 106 764759 7198 12
## 22 2013 67 581410 8722 11
## 23 2014 84 547829 6493 12
## 24 2015 115 848438 7355 17
## 25 2016 73 574863 7882 10
## 26 2017 143 1360160 9537 25
## 27 2018 172 1635758 9488 21
## 28 2019 NA NA NA NA
## 29 2020 NA NA NA NA
## 30 2021 NA NA NA NA
## CA_PCT_LARGE_FIRES CA_5yr_SPEI
## 1 0.028 -1.19
## 2 0.073 -0.52
## 3 0.069 -0.81
## 4 0.027 0.28
## 5 0.153 0.36
## 6 0.076 0.60
## 7 0.029 0.96
## 8 0.202 1.25
## 9 0.043 0.67
## 10 0.098 0.33
## 11 0.098 -0.05
## 12 0.177 -0.98
## 13 0.108 -1.04
## 14 0.029 -0.73
## 15 0.133 -0.16
## 16 0.104 -0.26
## 17 0.359 -0.53
## 18 0.041 -0.74
## 19 0.024 -0.80
## 20 0.047 -0.60
## 21 0.167 -0.56
## 22 0.126 -0.61
## 23 0.185 -1.03
## 24 0.231 -1.65
## 25 0.127 -1.93
## 26 0.262 -1.38
## 27 0.221 -1.45
## 28 NA -0.61
## 29 NA -0.61
## 30 NA -1.10
# ca_combined_SPEI correlation
data(ca_combined_SPEI, package = "mosaicData")
df <- dplyr::select_if(ca_combined_SPEI, is.numeric)
r <- cor(df, use = "complete.obs")
round(r, 2)
## AVG_FIRE_SIZE_CA AREA_BURNED_CA CA_NUM_FIRES
## AVG_FIRE_SIZE_CA 1.00 0.95 0.13
## AREA_BURNED_CA 0.95 1.00 0.40
## CA_NUM_FIRES 0.13 0.40 1.00
## CA_NUM_LARGE_FIRES 0.85 0.90 0.33
## CA_PCT_LARGE_FIRES 0.90 0.86 0.11
## CA_5yr_SPEI -0.36 -0.31 -0.09
## CA_NUM_LARGE_FIRES CA_PCT_LARGE_FIRES CA_5yr_SPEI
## AVG_FIRE_SIZE_CA 0.85 0.90 -0.36
## AREA_BURNED_CA 0.90 0.86 -0.31
## CA_NUM_FIRES 0.33 0.11 -0.09
## CA_NUM_LARGE_FIRES 1.00 0.96 -0.21
## CA_PCT_LARGE_FIRES 0.96 1.00 -0.26
## CA_5yr_SPEI -0.21 -0.26 1.00
ggcorrplot(r, hc.order = TRUE, type = "lower", lab = TRUE)
# Multiple Linear Regression
model <- lm(CA_5yr_SPEI ~ AVG_FIRE_SIZE_CA + AREA_BURNED_CA + CA_NUM_FIRES + CA_NUM_LARGE_FIRES +
CA_PCT_LARGE_FIRES, data = ca_combined_SPEI)
print(model)
##
## Call:
## lm(formula = CA_5yr_SPEI ~ AVG_FIRE_SIZE_CA + AREA_BURNED_CA +
## CA_NUM_FIRES + CA_NUM_LARGE_FIRES + CA_PCT_LARGE_FIRES, data = ca_combined_SPEI)
##
## Coefficients:
## (Intercept) AVG_FIRE_SIZE_CA AREA_BURNED_CA CA_NUM_FIRES
## 3.22289995 -0.01449452 0.00000088 -0.00036370
## CA_NUM_LARGE_FIRES CA_PCT_LARGE_FIRES
## 0.23928900 -21.63298553
summary(model)
##
## Call:
## lm(formula = CA_5yr_SPEI ~ AVG_FIRE_SIZE_CA + AREA_BURNED_CA +
## CA_NUM_FIRES + CA_NUM_LARGE_FIRES + CA_PCT_LARGE_FIRES, data = ca_combined_SPEI)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.383 -0.477 -0.110 0.475 1.932
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.22289995 2.14590187 1.50 0.15
## AVG_FIRE_SIZE_CA -0.01449452 0.05925187 -0.24 0.81
## AREA_BURNED_CA 0.00000088 0.00000649 0.14 0.89
## CA_NUM_FIRES -0.00036370 0.00024626 -1.48 0.15
## CA_NUM_LARGE_FIRES 0.23928900 0.27895819 0.86 0.40
## CA_PCT_LARGE_FIRES -21.63298553 26.62606936 -0.81 0.43
##
## Residual standard error: 0.75 on 21 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.29, Adjusted R-squared: 0.121
## F-statistic: 1.71 on 5 and 21 DF, p-value: 0.175
# Get the Intercept and coefficients as vector elements.
cat("# # # # The Coefficient Values # # # ", "\n")
## # # # # The Coefficient Values # # #
a <- coef(model)[1]
print(a)
## (Intercept)
## 3.2
XAVG_FIRE_SIZE_CA <- coef(model)[2]
XAREA_BURNED_CA <- coef(model)[3]
XCA_NUM_FIRES <- coef(model)[4]
XCA_NUM_LARGE_FIRES <- coef(model)[5]
XCA_PCT_LARGE_FIRES <- coef(model)[6]
print(XAVG_FIRE_SIZE_CA)
## AVG_FIRE_SIZE_CA
## -0.014
print(XAREA_BURNED_CA)
## AREA_BURNED_CA
## 0.00000088
print(XCA_NUM_FIRES)
## CA_NUM_FIRES
## -0.00036
print(XCA_NUM_LARGE_FIRES)
## CA_NUM_LARGE_FIRES
## 0.24
print(XCA_PCT_LARGE_FIRES)
## CA_PCT_LARGE_FIRES
## -22
# Use 2019 - 2021 ca_SPEI values to predict avg fire size
ca_fs_spei
## Year AVG_FIRE_SIZE_CA CA_5yr_SPEI
## 1 1992 27 -1.19
## 2 1993 39 -0.52
## 3 1994 47 -0.81
## 4 1995 29 0.28
## 5 1996 77 0.36
## 6 1997 41 0.60
## 7 1998 23 0.96
## 8 1999 90 1.25
## 9 2000 38 0.67
## 10 2001 43 0.33
## 11 2002 72 -0.05
## 12 2003 131 -0.98
## 13 2004 41 -1.04
## 14 2005 26 -0.73
## 15 2006 68 -0.16
## 16 2007 79 -0.26
## 17 2008 135 -0.53
## 18 2009 47 -0.74
## 19 2010 15 -0.80
## 20 2011 22 -0.60
## 21 2012 106 -0.56
## 22 2013 67 -0.61
## 23 2014 84 -1.03
## 24 2015 115 -1.65
## 25 2016 73 -1.93
## 26 2017 143 -1.38
## 27 2018 172 -1.45
## 28 2019 NA -0.61
## 29 2020 NA -0.61
## 30 2021 NA -1.10
ca_SPEI_new <- data.frame(CA_5yr_SPEI = c(-0.61, -0.61, -1.1))
ca_avg_fs_pred <- data.frame(Year = c(2019, 2020, 2021), AVG_FIRE_SIZE_CA = predict(fit1,
ca_SPEI_new))
ca_avg_fs_pred
## Year AVG_FIRE_SIZE_CA
## 1 2019 71
## 2 2020 71
## 3 2021 81
ca_fs_spei <- ca_fs_spei %>%
mutate(AVG_FIRE_SIZE_CA = if_else(Year == 2019, 71, AVG_FIRE_SIZE_CA))
ca_fs_spei <- ca_fs_spei %>%
mutate(AVG_FIRE_SIZE_CA = if_else(Year == 2020, 71, AVG_FIRE_SIZE_CA))
ca_fs_spei <- ca_fs_spei %>%
mutate(AVG_FIRE_SIZE_CA = if_else(Year == 2021, 81, AVG_FIRE_SIZE_CA))
ca_fs_spei
## Year AVG_FIRE_SIZE_CA CA_5yr_SPEI
## 1 1992 27 -1.19
## 2 1993 39 -0.52
## 3 1994 47 -0.81
## 4 1995 29 0.28
## 5 1996 77 0.36
## 6 1997 41 0.60
## 7 1998 23 0.96
## 8 1999 90 1.25
## 9 2000 38 0.67
## 10 2001 43 0.33
## 11 2002 72 -0.05
## 12 2003 131 -0.98
## 13 2004 41 -1.04
## 14 2005 26 -0.73
## 15 2006 68 -0.16
## 16 2007 79 -0.26
## 17 2008 135 -0.53
## 18 2009 47 -0.74
## 19 2010 15 -0.80
## 20 2011 22 -0.60
## 21 2012 106 -0.56
## 22 2013 67 -0.61
## 23 2014 84 -1.03
## 24 2015 115 -1.65
## 25 2016 73 -1.93
## 26 2017 143 -1.38
## 27 2018 172 -1.45
## 28 2019 71 -0.61
## 29 2020 71 -0.61
## 30 2021 81 -1.10
# Scatter plot with predicted values in red
ca_fs_spei %>%
ggplot(aes(x = CA_5yr_SPEI, y = AVG_FIRE_SIZE_CA)) + geom_point() + geom_point(data = ca_fs_spei[28:30,
], aes(x = CA_5yr_SPEI, y = AVG_FIRE_SIZE_CA), colour = "red", size = 2) + labs(title = "Scatter Plot with Predicted Average Fire Size Values in Red")
ca_fs_spei %>%
ggplot(aes(x = Year, y = AVG_FIRE_SIZE_CA)) + geom_bar(stat = "identity", fill = "grey") +
labs(title = "CA Average Fire Size with Predicted Values") + theme(title = element_text(size = 20),
axis.title.x = element_text(size = 16), axis.text.x = element_text(size = 14),
axis.text.y = element_text(size = 14), axis.title.y = element_text(size = 16))